Author: Ronan Lamy <ronan.l...@gmail.com> Branch: unicode-utf8-test Changeset: r93330:a31f4ea5722a Date: 2017-12-09 14:04 +0000 http://bitbucket.org/pypy/pypy/changeset/a31f4ea5722a/
Log: hg merge unicode-utf8 diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py --- a/pypy/module/_codecs/interp_codecs.py +++ b/pypy/module/_codecs/interp_codecs.py @@ -70,9 +70,6 @@ raise oefmt(space.w_IndexError, "position %d from error handler out of bounds", newpos) - if newpos < startpos: - raise oefmt(space.w_IndexError, - "position %d from error handler did not progress", newpos) w_replace = space.convert_to_w_unicode(w_replace) return w_replace._utf8, newpos return call_errorhandler @@ -226,7 +223,7 @@ w_end = space.getattr(w_exc, space.newtext('end')) end = space.int_w(w_end) start = w_obj._index_to_byte(start) - end = w_obj._index_to_byte(end) + end = w_obj._index_to_byte(end) builder = StringBuilder() pos = start obj = w_obj._utf8 diff --git a/pypy/module/array/interp_array.py b/pypy/module/array/interp_array.py --- a/pypy/module/array/interp_array.py +++ b/pypy/module/array/interp_array.py @@ -380,6 +380,7 @@ if len(s) % self.itemsize != 0: raise oefmt(self.space.w_ValueError, "string length not a multiple of item size") + self.check_valid_unicode(space, s) # empty for non-u arrays oldlen = self.len new = len(s) / self.itemsize if not new: @@ -710,6 +711,9 @@ s = "array('%s', %s)" % (self.typecode, space.text_w(r)) return space.newtext(s) + def check_valid_unicode(self, space, s): + pass # overwritten by u + W_ArrayBase.typedef = TypeDef( 'array.array', __new__ = interp2app(w_array), @@ -870,6 +874,18 @@ def get_buffer(self): return rffi.cast(mytype.arrayptrtype, self._buffer) + if mytype.unwrap == 'utf8_len_w': + def check_valid_unicode(self, space, s): + i = 0 + while i < len(s): + if s[i] != '\x00' or ord(s[i + 1]) > 0x10: + v = ((ord(s[i]) << 24) + (ord(s[i + 1]) << 16) + + (ord(s[i + 2]) << 8) + ord(s[i + 3])) + raise oefmt(space.w_ValueError, + "Character U+%s is not in range [U+0000, U+10ffff]", + hex(v)[2:]) + i += 4 + def item_w(self, w_item): space = self.space unwrap = getattr(space, mytype.unwrap) diff --git a/pypy/module/array/test/test_array.py b/pypy/module/array/test/test_array.py --- a/pypy/module/array/test/test_array.py +++ b/pypy/module/array/test/test_array.py @@ -844,13 +844,7 @@ import sys if sys.maxunicode == 0xffff: skip("test for 32-bit unicodes") - a = self.array('u', b'\xff\xff\xff\xff') - assert len(a) == 1 - assert repr(a[0]) == "u'\Uffffffff'" - if sys.maxint == 2147483647: - assert ord(a[0]) == -1 - else: - assert ord(a[0]) == 4294967295 + raises(ValueError, self.array, 'u', b'\xff\xff\xff\xff') def test_weakref(self): import weakref diff --git a/pypy/module/pyexpat/interp_pyexpat.py b/pypy/module/pyexpat/interp_pyexpat.py --- a/pypy/module/pyexpat/interp_pyexpat.py +++ b/pypy/module/pyexpat/interp_pyexpat.py @@ -587,21 +587,22 @@ def UnknownEncodingHandler(self, space, name, info): # Yes, supports only 8bit encodings - translationmap = space.unicode_w( + translationmap, lgt = space.utf8_len_w( space.call_method( space.newbytes(self.all_chars), "decode", space.newtext(name), space.newtext("replace"))) - if len(translationmap) != 256: + if lgt != 256: raise oefmt(space.w_ValueError, "multi-byte encodings are not supported") - for i in range(256): - c = translationmap[i] - if c == u'\ufffd': + i = 0 + for c in rutf8.Utf8StringIterator(translationmap): + if c == 0xfffd: info.c_map[i] = rffi.cast(rffi.INT, -1) else: info.c_map[i] = rffi.cast(rffi.INT, c) + i += 1 info.c_data = lltype.nullptr(rffi.VOIDP.TO) info.c_convert = lltype.nullptr(rffi.VOIDP.TO) info.c_release = lltype.nullptr(rffi.VOIDP.TO) diff --git a/pypy/module/unicodedata/interp_ucd.py b/pypy/module/unicodedata/interp_ucd.py --- a/pypy/module/unicodedata/interp_ucd.py +++ b/pypy/module/unicodedata/interp_ucd.py @@ -7,11 +7,8 @@ from pypy.interpreter.error import OperationError, oefmt from pypy.interpreter.typedef import TypeDef, interp_attrproperty from rpython.rlib.rarithmetic import r_longlong -from rpython.rlib.objectmodel import we_are_translated -from rpython.rlib.runicode import MAXUNICODE from rpython.rlib.unicodedata import unicodedb_5_2_0, unicodedb_3_2_0 -from rpython.rlib.runicode import code_to_unichr, ord_accepts_surrogate -import sys +from rpython.rlib.rutf8 import Utf8StringBuilder, unichr_as_utf8 # Contants for Hangul characters @@ -30,49 +27,17 @@ # unicode code point. -if MAXUNICODE > 0xFFFF: - # Target is wide build - def unichr_to_code_w(space, w_unichr): - if not space.isinstance_w(w_unichr, space.w_unicode): - raise oefmt( - space.w_TypeError, 'argument 1 must be unicode, not %T', - w_unichr) +# Target is wide build +def unichr_to_code_w(space, w_unichr): + if not space.isinstance_w(w_unichr, space.w_unicode): + raise oefmt( + space.w_TypeError, 'argument 1 must be unicode, not %T', + w_unichr) - if not we_are_translated() and sys.maxunicode == 0xFFFF: - # Host CPython is narrow build, accept surrogates - try: - return ord_accepts_surrogate(space.unicode_w(w_unichr)) - except TypeError: - raise oefmt(space.w_TypeError, - "need a single Unicode character as parameter") - else: - if not space.len_w(w_unichr) == 1: - raise oefmt(space.w_TypeError, - "need a single Unicode character as parameter") - return space.int_w(space.ord(w_unichr)) - -else: - # Target is narrow build - def unichr_to_code_w(space, w_unichr): - if not space.isinstance_w(w_unichr, space.w_unicode): - raise oefmt( - space.w_TypeError, 'argument 1 must be unicode, not %T', - w_unichr) - - if not we_are_translated() and sys.maxunicode > 0xFFFF: - # Host CPython is wide build, forbid surrogates - if not space.len_w(w_unichr) == 1: - raise oefmt(space.w_TypeError, - "need a single Unicode character as parameter") - return space.int_w(space.ord(w_unichr)) - - else: - # Accept surrogates - try: - return ord_accepts_surrogate(space.unicode_w(w_unichr)) - except TypeError: - raise oefmt(space.w_TypeError, - "need a single Unicode character as parameter") + if not space.len_w(w_unichr) == 1: + raise oefmt(space.w_TypeError, + "need a single Unicode character as parameter") + return space.int_w(space.ord(w_unichr)) class UCD(W_Root): @@ -110,7 +75,7 @@ except KeyError: msg = space.mod(space.newtext("undefined character name '%s'"), space.newtext(name)) raise OperationError(space.w_KeyError, msg) - return space.newunicode(code_to_unichr(code)) + return space.newutf8(unichr_as_utf8(code), 1) def name(self, space, w_unichr, w_default=None): code = unichr_to_code_w(space, w_unichr) @@ -259,10 +224,10 @@ result[0] = ch if not composed: # If decomposed normalization we are done - return space.newunicode(u''.join([unichr(i) for i in result[:j]])) + return self.build(space, result, stop=j) if j <= 1: - return space.newunicode(u''.join([unichr(i) for i in result[:j]])) + return self.build(space, result, stop=j) current = result[0] starter_pos = 0 @@ -310,7 +275,13 @@ result[starter_pos] = current - return space.newunicode(u''.join([unichr(i) for i in result[:next_insert]])) + return self.build(space, result, stop=next_insert) + + def build(self, space, r, stop): + builder = Utf8StringBuilder(stop * 3) + for i in range(stop): + builder.append_code(r[i]) + return space.newutf8(builder.build(), stop) methods = {} diff --git a/pypy/module/unicodedata/test/test_hyp.py b/pypy/module/unicodedata/test/test_hyp.py --- a/pypy/module/unicodedata/test/test_hyp.py +++ b/pypy/module/unicodedata/test/test_hyp.py @@ -1,3 +1,4 @@ + import pytest try: from hypothesis import given, strategies as st, example, settings @@ -5,12 +6,14 @@ pytest.skip("hypothesis required") from pypy.module.unicodedata.interp_ucd import ucd +from rpython.rlib.rutf8 import get_utf8_length def make_normalization(space, NF_code): def normalize(s): - w_s = space.newunicode(s) + u = s.encode('utf8') + w_s = space.newutf8(u, get_utf8_length(u)) w_res = ucd.normalize(space, NF_code, w_s) - return space.unicode_w(w_res) + return space.utf8_w(w_res).decode('utf8') return normalize all_forms = ['NFC', 'NFD', 'NFKC', 'NFKD'] _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit