Author: Carl Friedrich Bolz-Tereick <cfb...@gmx.de>
Branch: py3.6
Changeset: r97436:7c591df76a01
Date: 2019-09-11 13:42 +0200
http://bitbucket.org/pypy/pypy/changeset/7c591df76a01/
Log: more improvement to the performance of _io: make get_chars track the number of unicode codepoints. also fix a bug in W_TextIOWrapper._read that assumed ascii diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py --- a/pypy/module/_io/interp_textio.py +++ b/pypy/module/_io/interp_textio.py @@ -308,14 +308,18 @@ class DecodeBuffer(object): - def __init__(self, text=None): + def __init__(self, text=None, ulen=-1): # self.text is a valid utf-8 string - self.text = None + if text is not None: + assert ulen >= 0 + self.text = text self.pos = 0 self.upos = 0 + self.ulen = ulen def set(self, space, w_decoded): check_decoded(space, w_decoded) + self.ulen = space.len_w(w_decoded) self.text = space.utf8_w(w_decoded) self.pos = 0 self.upos = 0 @@ -324,13 +328,14 @@ self.text = None self.pos = 0 self.upos = 0 + self.ulen = -1 def get_chars(self, size): """ returns a tuple (utf8, lgt) """ if self.text is None or size == 0: - return "" + return "", 0 - lgt = codepoints_in_utf8(self.text) + lgt = self.ulen available = lgt - self.upos if size < 0 or size > available: size = available @@ -350,8 +355,9 @@ chars = self.text self.pos = len(self.text) self.upos = lgt + size = lgt - return chars + return chars, size def has_data(self): return (self.text is not None and not self.exhausted()) @@ -758,7 +764,7 @@ w_bytes = space.call_method(self.w_buffer, "read") w_decoded = space.call_method(self.w_decoder, "decode", w_bytes, space.w_True) check_decoded(space, w_decoded) - w_result = space.newtext(self.decoded.get_chars(-1)) + w_result = space.newutf8(*self.decoded.get_chars(-1)) w_final = space.add(w_result, w_decoded) self.decoded.reset() self.snapshot = None @@ -766,15 +772,15 @@ def _read(self, space, size): remaining = size - builder = StringBuilder(size) + builder = Utf8StringBuilder(size) # Keep reading chunks until we have n characters to return while remaining > 0: if not self._ensure_data(space): break - data = self.decoded.get_chars(remaining) - 
builder.append(data) - remaining -= len(data) + data, size = self.decoded.get_chars(remaining) + builder.append_utf8(data, size) + remaining -= size return space.newutf8(builder.build(), builder.getlength()) @@ -804,6 +810,7 @@ def _readline(self, space, limit): # This is a separate function so that readline_w() can be jitted. remnant = None + remnant_ulen = -1 builder = Utf8StringBuilder() while True: # First, get some data if necessary @@ -811,7 +818,7 @@ if not has_data: # end of file if remnant: - builder.append(remnant) # XXX + builder.append_utf8(remnant, remnant_ulen) break if remnant: @@ -820,11 +827,14 @@ if remnant == '\r' and self.decoded.text[0] == '\n': builder.append_utf8('\r\n', 2) self.decoded.pos = 1 + self.decoded.upos = 1 remnant = None + remnant_ulen = -1 break else: - builder.append(remnant) # XXX + builder.append_utf8(remnant, remnant_ulen) remnant = None + remnant_ulen = -1 continue if limit >= 0: @@ -848,7 +858,7 @@ # There may be some remaining chars we'll have to prepend to the # next chunk of data if not self.decoded.exhausted(): - remnant = self.decoded.get_chars(-1) + remnant, remnant_ulen = self.decoded.get_chars(-1) # We have consumed the buffer self.decoded.reset() diff --git a/pypy/module/_io/test/test_interp_textio.py b/pypy/module/_io/test/test_interp_textio.py --- a/pypy/module/_io/test/test_interp_textio.py +++ b/pypy/module/_io/test/test_interp_textio.py @@ -58,17 +58,21 @@ @given(st.text()) def test_read_buffer(text): - buf = DecodeBuffer(text.encode('utf8')) - assert buf.get_chars(-1).decode('utf8') == text + buf = DecodeBuffer(text.encode('utf8'), len(text)) + chars, size = buf.get_chars(-1) + assert chars.decode('utf8') == text + assert len(text) == size assert buf.exhausted() @given(st.text(), st.lists(st.integers(min_value=0))) @example(u'\x80', [1]) def test_readn_buffer(text, sizes): - buf = DecodeBuffer(text.encode('utf8')) + buf = DecodeBuffer(text.encode('utf8'), len(text)) strings = [] for n in sizes: - s = 
buf.get_chars(n).decode('utf8') + chars, size = buf.get_chars(n) + s = chars.decode('utf8') + assert size == len(s) if not buf.exhausted(): assert len(s) == n else: @@ -79,7 +83,7 @@ @given(st.text()) @example(u'\x800') def test_next_char(text): - buf = DecodeBuffer(text.encode('utf8')) + buf = DecodeBuffer(text.encode('utf8'), len(text)) chars = [] try: while True: diff --git a/pypy/module/_io/test/test_textio.py b/pypy/module/_io/test/test_textio.py --- a/pypy/module/_io/test/test_textio.py +++ b/pypy/module/_io/test/test_textio.py @@ -1,3 +1,5 @@ +#encoding: utf-8 + class AppTestTextIO: spaceconfig = dict(usemodules=['_io', '_locale', 'array']) @@ -141,6 +143,15 @@ reads += t.readline() assert reads == "abc\ndef\n" + def test_read_bug_unicode(self): + import _io + r = _io.BytesIO(b"\xc3\xa4bc\ndef\n") + t = _io.TextIOWrapper(r, encoding="utf-8") + reads = t.read(4) + assert reads == "äbc\n" + reads += t.readline() + assert reads == "äbc\ndef\n" + def test_encoded_writes(self): import _io data = "1234567890" _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit