Author: Carl Friedrich Bolz-Tereick <cfb...@gmx.de>
Branch: py3.6
Changeset: r97486:a94f909131d8
Date: 2019-09-16 10:29 +0200
http://bitbucket.org/pypy/pypy/changeset/a94f909131d8/
Log: merge default diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py --- a/pypy/interpreter/unicodehelper.py +++ b/pypy/interpreter/unicodehelper.py @@ -31,7 +31,7 @@ # Fast version of the "strict" errors handler. def raise_unicode_exception_encode(errors, encoding, msg, utf8, startingpos, endingpos): - u_len = rutf8.get_utf8_length(utf8) + u_len = rutf8.codepoints_in_utf8(utf8) raise OperationError(space.w_UnicodeEncodeError, space.newtuple([space.newtext(encoding), space.newutf8(utf8, u_len), diff --git a/pypy/module/_io/interp_stringio.py b/pypy/module/_io/interp_stringio.py --- a/pypy/module/_io/interp_stringio.py +++ b/pypy/module/_io/interp_stringio.py @@ -1,4 +1,4 @@ -from rpython.rlib.rutf8 import get_utf8_length, next_codepoint_pos +from rpython.rlib.rutf8 import codepoints_in_utf8, next_codepoint_pos from pypy.interpreter.error import OperationError, oefmt from pypy.interpreter.typedef import ( @@ -98,7 +98,7 @@ return result def write(self, string): - length = get_utf8_length(string) + length = codepoints_in_utf8(string) if self.pos + length > len(self.data): self.resize(self.pos + length) pos = 0 @@ -173,7 +173,7 @@ if readnl is None: w_readnl = space.w_None else: - w_readnl = space.str(space.newutf8(readnl, get_utf8_length(readnl))) # YYY + w_readnl = space.str(space.newutf8(readnl, codepoints_in_utf8(readnl))) # YYY return space.newtuple([ w_initialval, w_readnl, space.newint(self.buf.pos), w_dict ]) @@ -239,7 +239,7 @@ w_decoded = space.call_method( w_decoded, "replace", space.newtext("\n"), - space.newutf8(writenl, get_utf8_length(writenl)), + space.newutf8(writenl, codepoints_in_utf8(writenl)), ) string = space.utf8_w(w_decoded) if string: @@ -251,7 +251,7 @@ self._check_closed(space) size = convert_size(space, w_size) v = self.buf.read(size) - lgt = get_utf8_length(v) + lgt = codepoints_in_utf8(v) return space.newutf8(v, lgt) def readline_w(self, space, w_limit=None): @@ -266,7 +266,7 @@ else: newline = self.readnl 
result = self.buf.readline(newline, limit) - resultlen = get_utf8_length(result) + resultlen = codepoints_in_utf8(result) return space.newutf8(result, resultlen) @@ -305,7 +305,7 @@ def getvalue_w(self, space): self._check_closed(space) v = self.buf.getvalue() - lgt = get_utf8_length(v) + lgt = codepoints_in_utf8(v) return space.newutf8(v, lgt) def readable_w(self, space): diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py --- a/pypy/module/_io/interp_textio.py +++ b/pypy/module/_io/interp_textio.py @@ -12,7 +12,7 @@ from rpython.rlib.rbigint import rbigint from rpython.rlib.rstring import StringBuilder from rpython.rlib.rutf8 import (check_utf8, next_codepoint_pos, - codepoints_in_utf8, get_utf8_length, + codepoints_in_utf8, codepoints_in_utf8, Utf8StringBuilder) @@ -905,7 +905,7 @@ haslf = True if haslf and self.writetranslate and self.writenl: w_text = space.call_method(w_text, "replace", space.newutf8('\n', 1), - space.newutf8(self.writenl, get_utf8_length(self.writenl))) + space.newutf8(self.writenl, codepoints_in_utf8(self.writenl))) text = space.utf8_w(w_text) needflush = False diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py --- a/pypy/module/_multibytecodec/c_codecs.py +++ b/pypy/module/_multibytecodec/c_codecs.py @@ -157,7 +157,7 @@ replace, end, rettype = errorcb(errors, namecb, reason, stringdata, start, end) # 'replace' is UTF8 encoded unicode, rettype is 'u' - lgt = rutf8.get_utf8_length(replace) + lgt = rutf8.codepoints_in_utf8(replace) inbuf = rffi.utf82wcharp(replace, lgt) try: r = pypy_cjk_dec_replace_on_error(decodebuf, inbuf, lgt, end) diff --git a/pypy/module/_multibytecodec/interp_incremental.py b/pypy/module/_multibytecodec/interp_incremental.py --- a/pypy/module/_multibytecodec/interp_incremental.py +++ b/pypy/module/_multibytecodec/interp_incremental.py @@ -67,7 +67,7 @@ pos = c_codecs.pypy_cjk_dec_inbuf_consumed(self.decodebuf) assert 0 <= pos <= len(object) 
self.pending = object[pos:] - lgt = rutf8.get_utf8_length(output) + lgt = rutf8.codepoints_in_utf8(output) return space.newutf8(output, lgt) diff --git a/pypy/module/_multibytecodec/interp_multibytecodec.py b/pypy/module/_multibytecodec/interp_multibytecodec.py --- a/pypy/module/_multibytecodec/interp_multibytecodec.py +++ b/pypy/module/_multibytecodec/interp_multibytecodec.py @@ -27,7 +27,7 @@ raise wrap_unicodedecodeerror(space, e, input, self.name) except RuntimeError: raise wrap_runtimeerror(space) - lgt = rutf8.get_utf8_length(utf8_output) + lgt = rutf8.codepoints_in_utf8(utf8_output) return space.newtuple([space.newutf8(utf8_output, lgt), space.newint(len(input))]) diff --git a/pypy/module/_multibytecodec/test/test_translation.py b/pypy/module/_multibytecodec/test/test_translation.py --- a/pypy/module/_multibytecodec/test/test_translation.py +++ b/pypy/module/_multibytecodec/test/test_translation.py @@ -14,7 +14,7 @@ codecname, string = argv[1], argv[2] c = c_codecs.getcodec(codecname) u = c_codecs.decode(c, string) - lgt = rutf8.get_utf8_length(u) + lgt = rutf8.codepoints_in_utf8(u) r = c_codecs.encode(c, u, lgt) print r return 0 diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py --- a/pypy/module/_sre/interp_sre.py +++ b/pypy/module/_sre/interp_sre.py @@ -49,7 +49,7 @@ return space.newbytes(ctx._string[start:end]) elif isinstance(ctx, rsre_utf8.Utf8MatchContext): s = ctx._utf8[start:end] - lgt = rutf8.get_utf8_length(s) + lgt = rutf8.codepoints_in_utf8(s) return space.newutf8(s, lgt) else: # unreachable @@ -496,7 +496,7 @@ elif use_builder == 'U': assert isinstance(ctx, rsre_utf8.Utf8MatchContext) return space.newutf8(result_bytes, - rutf8.get_utf8_length(result_bytes)), n + rutf8.codepoints_in_utf8(result_bytes)), n else: raise AssertionError(use_builder) else: @@ -788,7 +788,7 @@ elif isinstance(ctx, rsre_core.StrMatchContext): return space.newbytes(ctx._string) elif isinstance(ctx, rsre_utf8.Utf8MatchContext): - lgt = 
rutf8.get_utf8_length(ctx._utf8) + lgt = rutf8.codepoints_in_utf8(ctx._utf8) return space.newutf8(ctx._utf8, lgt) else: raise SystemError diff --git a/pypy/module/micronumpy/boxes.py b/pypy/module/micronumpy/boxes.py --- a/pypy/module/micronumpy/boxes.py +++ b/pypy/module/micronumpy/boxes.py @@ -11,7 +11,7 @@ from rpython.rlib.rstring import StringBuilder from rpython.rlib.objectmodel import specialize from rpython.rlib import jit -from rpython.rlib.rutf8 import get_utf8_length +from rpython.rlib.rutf8 import codepoints_in_utf8 from rpython.rtyper.lltypesystem import lltype, rffi from rpython.tool.sourcetools import func_with_new_name from pypy.module.micronumpy import constants as NPY @@ -629,7 +629,7 @@ return self elif dtype.is_object(): return W_ObjectBox(space.newutf8(self._value, - get_utf8_length(self._value))) + codepoints_in_utf8(self._value))) else: raise oefmt(space.w_NotImplementedError, "Conversion from unicode not implemented yet") diff --git a/pypy/module/micronumpy/types.py b/pypy/module/micronumpy/types.py --- a/pypy/module/micronumpy/types.py +++ b/pypy/module/micronumpy/types.py @@ -1,7 +1,7 @@ import functools import math from rpython.rlib.unroll import unrolling_iterable -from rpython.rlib.rutf8 import Utf8StringIterator, get_utf8_length, Utf8StringBuilder +from rpython.rlib.rutf8 import Utf8StringIterator, codepoints_in_utf8, Utf8StringBuilder from pypy.interpreter.error import OperationError, oefmt from pypy.objspace.std.floatobject import float2string from pypy.objspace.std.complexobject import str_format @@ -2330,7 +2330,7 @@ def to_builtin_type(self, space, box): assert isinstance(box, boxes.W_UnicodeBox) - return space.newutf8(box._value, get_utf8_length(box._value)) + return space.newutf8(box._value, codepoints_in_utf8(box._value)) def eq(self, v1, v2): assert isinstance(v1, boxes.W_UnicodeBox) diff --git a/pypy/module/unicodedata/test/test_hyp.py b/pypy/module/unicodedata/test/test_hyp.py --- a/pypy/module/unicodedata/test/test_hyp.py 
+++ b/pypy/module/unicodedata/test/test_hyp.py @@ -6,12 +6,12 @@ pytest.skip("hypothesis required") from pypy.module.unicodedata.interp_ucd import ucd -from rpython.rlib.rutf8 import get_utf8_length +from rpython.rlib.rutf8 import codepoints_in_utf8 def make_normalization(space, NF_code): def normalize(s): u = s.encode('utf8') - w_s = space.newutf8(u, get_utf8_length(u)) + w_s = space.newutf8(u, codepoints_in_utf8(u)) w_res = ucd.normalize(space, NF_code, w_s) return space.utf8_w(w_res).decode('utf8') return normalize diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py --- a/rpython/rlib/rutf8.py +++ b/rpython/rlib/rutf8.py @@ -363,25 +363,12 @@ raise CheckError(~res) def get_utf8_length(s, start=0, end=-1): + # DEPRECATED! use codepoints_in_utf8 instead """ Get the length out of valid utf8. """ if end < 0: end = len(s) - res = 0 - pos = start - while pos < end: - ordch1 = ord(s[pos]) - res += 1 - if ordch1 <= 0x7F: - pos += 1 - elif ordch1 <= 0xDF: - pos += 2 - elif ordch1 <= 0xEF: - pos += 3 - elif ordch1 <= 0xF4: - pos += 4 - - return res + return codepoints_in_utf8(s, start, end) @jit.elidable def _check_utf8(s, allow_surrogates, start, stop): @@ -761,13 +748,13 @@ def append(self, s): # for strings self._s.append(s) - newlgt = get_utf8_length(s) + newlgt = codepoints_in_utf8(s) self._lgt += newlgt @always_inline def append_slice(self, s, start, end): self._s.append_slice(s, start, end) - newlgt = get_utf8_length(s, start, end) + newlgt = codepoints_in_utf8(s, start, end) self._lgt += newlgt @signature(types.self(), char(), returns=none()) diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py --- a/rpython/rlib/test/test_rutf8.py +++ b/rpython/rlib/test/test_rutf8.py @@ -169,14 +169,6 @@ expected = any(uch for uch in unichars if u'\ud800' <= uch <= u'\udfff') assert result == expected -@given(strategies.lists(strategies.characters())) -def test_get_utf8_length(unichars): - u = u''.join(unichars) - exp_lgt = len(u) - s = 
''.join([c.encode('utf8') for c in u]) - lgt = rutf8.get_utf8_length(s) - if not _has_surrogates(s) or sys.maxunicode > 0xffff: - assert lgt == exp_lgt def test_utf8_string_builder(): s = rutf8.Utf8StringBuilder() _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit