https://github.com/python/cpython/commit/a3d5aab9a89e311cded9c724ce7d5a873e4d680d commit: a3d5aab9a89e311cded9c724ce7d5a873e4d680d branch: main author: Cody Maloney <cmalo...@users.noreply.github.com> committer: vstinner <vstin...@python.org> date: 2025-02-07T12:06:11+01:00 summary:
gh-129005: Align FileIO.readall between _pyio and _io (#129705) Utilize `bytearray.resize()` and `os.readinto()` to reduce copies and match behavior of `_io.FileIO.readall()`. There is still an extra copy which means twice the memory required compared to FileIO because there isn't a zero-copy path from `bytearray` -> `bytes` currently. On my system reading a 2 GB file: `./python -m test -M8g -uall test_largefile -m test.test_largefile.PyLargeFileTest.test_large_read -v` Goes from ~2.7 seconds -> ~2.2 seconds Co-authored-by: Victor Stinner <vstin...@python.org> files: A Misc/NEWS.d/next/Library/2025-02-05-13-19-15.gh-issue-129005.Sb69L_.rst M Lib/_pyio.py diff --git a/Lib/_pyio.py b/Lib/_pyio.py index b3a8f37d68acdb..f7370dff19efc8 100644 --- a/Lib/_pyio.py +++ b/Lib/_pyio.py @@ -1454,6 +1454,17 @@ def write(self, b): return BufferedWriter.write(self, b) +def _new_buffersize(bytes_read): + # Parallels _io/fileio.c new_buffersize + if bytes_read > 65536: + addend = bytes_read >> 3 + else: + addend = 256 + bytes_read + if addend < DEFAULT_BUFFER_SIZE: + addend = DEFAULT_BUFFER_SIZE + return bytes_read + addend + + class FileIO(RawIOBase): _fd = -1 _created = False @@ -1672,22 +1683,20 @@ def readall(self): except OSError: pass - result = bytearray() - while True: - if len(result) >= bufsize: - bufsize = len(result) - bufsize += max(bufsize, DEFAULT_BUFFER_SIZE) - n = bufsize - len(result) - try: - chunk = os.read(self._fd, n) - except BlockingIOError: - if result: - break + result = bytearray(bufsize) + bytes_read = 0 + try: + while n := os.readinto(self._fd, memoryview(result)[bytes_read:]): + bytes_read += n + if bytes_read >= len(result): + result.resize(_new_buffersize(bytes_read)) + except BlockingIOError: + if not bytes_read: return None - if not chunk: # reached the end of the file - break - result += chunk + assert len(result) - bytes_read >= 1, \ + "os.readinto buffer size 0 will result in erroneous EOF / returns 0" + result.resize(bytes_read) return bytes(result) def readinto(self, buffer): diff --git a/Misc/NEWS.d/next/Library/2025-02-05-13-19-15.gh-issue-129005.Sb69L_.rst b/Misc/NEWS.d/next/Library/2025-02-05-13-19-15.gh-issue-129005.Sb69L_.rst new file mode 100644 index 00000000000000..236d7766b92cc6 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-02-05-13-19-15.gh-issue-129005.Sb69L_.rst @@ -0,0 +1,2 @@ +``_pyio.FileIO.readall()`` now allocates, resizes, and fills a data buffer +using the same algorithm ``_io.FileIO.readall()`` uses. _______________________________________________ Python-checkins mailing list -- python-checkins@python.org To unsubscribe send an email to python-checkins-le...@python.org https://mail.python.org/mailman3/lists/python-checkins.python.org/ Member address: arch...@mail-archive.com