Author: Carl Friedrich Bolz-Tereick <cfb...@gmx.de>
Branch: 
Changeset: r97434:2352eded240c
Date: 2019-09-11 13:42 +0200
http://bitbucket.org/pypy/pypy/changeset/2352eded240c/
Log: backport eee2717be5e2 to default: more improvements to the performance
     of _io: make get_chars track the number of unicode codepoints. Also fix
     a bug in W_TextIOWrapper._read that assumed ASCII.

diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py
--- a/pypy/module/_io/interp_textio.py
+++ b/pypy/module/_io/interp_textio.py
@@ -293,14 +293,18 @@
 
 
 class DecodeBuffer(object):
-    def __init__(self, text=None):
+    def __init__(self, text=None, ulen=-1):
         # self.text is a valid utf-8 string
+        if text is not None:
+            assert ulen >= 0
         self.text = text
         self.pos = 0
         self.upos = 0
+        self.ulen = ulen
 
     def set(self, space, w_decoded):
         check_decoded(space, w_decoded)
+        self.ulen = space.len_w(w_decoded)
         self.text = space.utf8_w(w_decoded)
         self.pos = 0
         self.upos = 0
@@ -309,13 +313,14 @@
         self.text = None
         self.pos = 0
         self.upos = 0
+        self.ulen = -1
 
     def get_chars(self, size):
         """ returns a tuple (utf8, lgt) """
         if self.text is None or size == 0:
-            return ""
+            return "", 0
 
-        lgt = codepoints_in_utf8(self.text)
+        lgt = self.ulen
         available = lgt - self.upos
         if size < 0 or size > available:
             size = available
@@ -323,7 +328,6 @@
 
         if self.pos > 0 or size < available:
             start = self.pos
-            ret = []
             pos = start
             for i in range(size):
                 pos = next_codepoint_pos(self.text, pos)
@@ -336,8 +340,9 @@
             chars = self.text
             self.pos = len(self.text)
             self.upos = lgt
+            size = lgt
 
-        return chars
+        return chars, size
 
     def has_data(self):
         return (self.text is not None and not self.exhausted())
@@ -709,8 +714,7 @@
             w_bytes = space.call_method(self.w_buffer, "read")
             w_decoded = space.call_method(self.w_decoder, "decode", w_bytes, space.w_True)
             check_decoded(space, w_decoded)
-            chars = self.decoded.get_chars(-1)
-            lgt = get_utf8_length(chars)
+            chars, lgt = self.decoded.get_chars(-1)
             w_result = space.newutf8(chars, lgt)
             w_final = space.add(w_result, w_decoded)
             self.snapshot = None
@@ -723,9 +727,9 @@
         while remaining > 0:
             if not self._ensure_data(space):
                 break
-            data = self.decoded.get_chars(remaining)
-            builder.append(data)
-            remaining -= len(data)
+            data, size = self.decoded.get_chars(remaining)
+            builder.append_utf8(data, size)
+            remaining -= size
 
         return space.newutf8(builder.build(), builder.getlength())
 
@@ -756,6 +760,7 @@
     def _readline(self, space, limit):
         # This is a separate function so that readline_w() can be jitted.
         remnant = None
+        remnant_ulen = -1
         builder = Utf8StringBuilder()
         while True:
             # First, get some data if necessary
@@ -763,7 +768,7 @@
             if not has_data:
                 # end of file
                 if remnant:
-                    builder.append(remnant) # XXX
+                    builder.append_utf8(remnant, remnant_ulen)
                 break
 
             if remnant:
@@ -772,11 +777,14 @@
                 if remnant == '\r' and self.decoded.text[0] == '\n':
                     builder.append_utf8('\r\n', 2)
                     self.decoded.pos = 1
+                    self.decoded.upos = 1
                     remnant = None
+                    remnant_ulen = -1
                     break
                 else:
-                    builder.append(remnant) # XXX
+                    builder.append_utf8(remnant, remnant_ulen)
                     remnant = None
+                    remnant_ulen = -1
                     continue
 
             if limit >= 0:
@@ -800,7 +808,7 @@
             # There may be some remaining chars we'll have to prepend to the
             # next chunk of data
             if not self.decoded.exhausted():
-                remnant = self.decoded.get_chars(-1)
+                remnant, remnant_ulen = self.decoded.get_chars(-1)
             # We have consumed the buffer
             self.decoded.reset()
 
diff --git a/pypy/module/_io/test/test_interp_textio.py b/pypy/module/_io/test/test_interp_textio.py
--- a/pypy/module/_io/test/test_interp_textio.py
+++ b/pypy/module/_io/test/test_interp_textio.py
@@ -58,27 +58,31 @@
 
 @given(st.text())
 def test_read_buffer(text):
-    buf = DecodeBuffer(text.encode('utf-8'))
-    assert buf.get_chars(-1) == text.encode('utf-8')
+    buf = DecodeBuffer(text.encode('utf8'), len(text))
+    chars, size = buf.get_chars(-1)
+    assert chars.decode('utf8') == text
+    assert len(text) == size
     assert buf.exhausted()
 
 @given(st.text(), st.lists(st.integers(min_value=0)))
 @example(u'\x80', [1])
 def test_readn_buffer(text, sizes):
-    buf = DecodeBuffer(text.encode('utf-8'))
+    buf = DecodeBuffer(text.encode('utf8'), len(text))
     strings = []
     for n in sizes:
-        s = buf.get_chars(n)
+        chars, size = buf.get_chars(n)
+        s = chars.decode('utf8')
+        assert size == len(s)
        if not buf.exhausted():
-            assert len(s.decode('utf-8')) == n
+            assert len(s) == n
         else:
-            assert len(s.decode('utf-8')) <= n
+            assert len(s) <= n
         strings.append(s)
-    assert ''.join(strings) == text[:sum(sizes)].encode('utf-8')
+    assert ''.join(strings) == text[:sum(sizes)]
 
 @given(st.text())
 def test_next_char(text):
-    buf = DecodeBuffer(text.encode('utf-8'))
+    buf = DecodeBuffer(text.encode('utf8'), len(text))
     for i in range(len(text)):
         ch = buf.next_char()
         assert ch == text[i].encode('utf-8')
diff --git a/pypy/module/_io/test/test_textio.py b/pypy/module/_io/test/test_textio.py
--- a/pypy/module/_io/test/test_textio.py
+++ b/pypy/module/_io/test/test_textio.py
@@ -1,3 +1,5 @@
+#encoding: utf-8
+
 class AppTestTextIO:
     spaceconfig = dict(usemodules=['_io', '_locale'])
 
@@ -103,6 +105,16 @@
         reads += t.readline()
         assert reads == u"abc\ndef\n"
 
+    def test_read_bug_unicode(self):
+        import _io
+        inp = b"\xc3\xa4bc\ndef\n"
+        r = _io.BytesIO(inp)
+        t = _io.TextIOWrapper(r, encoding="utf-8")
+        reads = t.read(4)
+        assert reads == inp[:5].decode("utf-8")
+        reads += t.readline()
+        assert reads == inp.decode("utf-8")
+
    def test_encoded_writes(self):
         import _io
         data = u"1234567890"
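
For anyone who wants to see the behaviour that the new test_read_bug_unicode
pins down without building PyPy, here is a minimal self-contained sketch
using only the standard io module (it relies on CPython's TextIOWrapper
rather than the PyPy-internal W_TextIOWrapper and DecodeBuffer touched by
this changeset). The point is that read(4) counts unicode codepoints, not
bytes, so with a two-byte UTF-8 character at the start it has to consume
five bytes of input:

    import io

    # Same input as the new test: the first character (a-umlaut) is two
    # UTF-8 bytes, so four codepoints correspond to five bytes of input.
    inp = b"\xc3\xa4bc\ndef\n"

    t = io.TextIOWrapper(io.BytesIO(inp), encoding="utf-8")

    reads = t.read(4)                        # four codepoints...
    assert reads == inp[:5].decode("utf-8")  # ...i.e. the first five bytes, decoded

    reads += t.readline()                    # the rest of the stream is intact
    assert reads == inp.decode("utf-8")
    print("ok")

With the old byte-based accounting in _read (remaining -= len(data) on the
UTF-8 bytes), a read() over multi-byte characters could decrement remaining
too fast and return fewer codepoints than requested; the change to
remaining -= size fixes exactly that.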