https://github.com/python/cpython/commit/b1b4f9625c5f2a6b2c32bc5ee91c9fef3894b5e6
commit: b1b4f9625c5f2a6b2c32bc5ee91c9fef3894b5e6
branch: main
author: morotti <r.moro...@gmail.com>
committer: gpshead <g...@krypto.org>
date: 2025-03-07T11:36:12-08:00
summary:
gh-117151: IO performance improvement, increase io.DEFAULT_BUFFER_SIZE to 128k (GH-118144) Co-authored-by: rmorotti <romain.moro...@man.com> files: A Misc/NEWS.d/next/Library/2024-04-30-14-03-09.gh-issue-117151.yt2H8c.rst M Doc/library/functions.rst M Lib/_pyio.py M Lib/test/test_file.py M Modules/_io/_iomodule.c M Modules/_io/_iomodule.h M Modules/_io/clinic/_iomodule.c.h diff --git a/Doc/library/functions.rst b/Doc/library/functions.rst index a7549b9bce76e2..7e367a0f2b6b25 100644 --- a/Doc/library/functions.rst +++ b/Doc/library/functions.rst @@ -1405,10 +1405,10 @@ are always available. They are listed here in alphabetical order. :func:`io.TextIOWrapper.reconfigure`. When no *buffering* argument is given, the default buffering policy works as follows: - * Binary files are buffered in fixed-size chunks; the size of the buffer is - chosen using a heuristic trying to determine the underlying device's "block - size" and falling back on :const:`io.DEFAULT_BUFFER_SIZE`. On many systems, - the buffer will typically be 4096 or 8192 bytes long. + * Binary files are buffered in fixed-size chunks; the size of the buffer + is ``max(min(blocksize, 8 MiB), DEFAULT_BUFFER_SIZE)`` + when the device block size is available. + On most systems, the buffer will typically be 128 kilobytes long. * "Interactive" text files (files for which :meth:`~io.IOBase.isatty` returns ``True``) use line buffering. Other text files use the policy diff --git a/Lib/_pyio.py b/Lib/_pyio.py index 99d6f796de416e..b875103bee441c 100644 --- a/Lib/_pyio.py +++ b/Lib/_pyio.py @@ -23,8 +23,9 @@ valid_seek_flags.add(os.SEEK_HOLE) valid_seek_flags.add(os.SEEK_DATA) -# open() uses st_blksize whenever we can -DEFAULT_BUFFER_SIZE = 8 * 1024 # bytes +# open() uses max(min(blocksize, 8 MiB), DEFAULT_BUFFER_SIZE) +# when the device block size is available. +DEFAULT_BUFFER_SIZE = 128 * 1024 # bytes # NOTE: Base classes defined here are registered with the "official" ABCs # defined in io.py. 
We don't use real inheritance though, because we don't want @@ -123,10 +124,10 @@ def open(file, mode="r", buffering=-1, encoding=None, errors=None, the size of a fixed-size chunk buffer. When no buffering argument is given, the default buffering policy works as follows: - * Binary files are buffered in fixed-size chunks; the size of the buffer - is chosen using a heuristic trying to determine the underlying device's - "block size" and falling back on `io.DEFAULT_BUFFER_SIZE`. - On many systems, the buffer will typically be 4096 or 8192 bytes long. + * Binary files are buffered in fixed-size chunks; the size of the buffer + is max(min(blocksize, 8 MiB), DEFAULT_BUFFER_SIZE) + when the device block size is available. + On most systems, the buffer will typically be 128 kilobytes long. * "Interactive" text files (files for which isatty() returns True) use line buffering. Other text files use the policy described above @@ -242,7 +243,7 @@ def open(file, mode="r", buffering=-1, encoding=None, errors=None, buffering = -1 line_buffering = True if buffering < 0: - buffering = raw._blksize + buffering = max(min(raw._blksize, 8192 * 1024), DEFAULT_BUFFER_SIZE) if buffering < 0: raise ValueError("invalid buffering size") if buffering == 0: diff --git a/Lib/test/test_file.py b/Lib/test/test_file.py index 1206032a93566e..029c903e01afb9 100644 --- a/Lib/test/test_file.py +++ b/Lib/test/test_file.py @@ -216,6 +216,16 @@ def testSetBufferSize(self): with self.assertWarnsRegex(RuntimeWarning, 'line buffering'): self._checkBufferSize(1) + def testDefaultBufferSize(self): + with self.open(TESTFN, 'wb') as f: + blksize = f.raw._blksize + f.write(b"\0" * 5_000_000) + + with self.open(TESTFN, 'rb') as f: + data = f.read1() + expected_size = max(min(blksize, 8192 * 1024), io.DEFAULT_BUFFER_SIZE) + self.assertEqual(len(data), expected_size) + def testTruncateOnWindows(self): # SF bug <https://bugs.python.org/issue801631> # "file.truncate fault on windows" diff --git 
a/Misc/NEWS.d/next/Library/2024-04-30-14-03-09.gh-issue-117151.yt2H8c.rst b/Misc/NEWS.d/next/Library/2024-04-30-14-03-09.gh-issue-117151.yt2H8c.rst new file mode 100644 index 00000000000000..6b13debcdccb48 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-04-30-14-03-09.gh-issue-117151.yt2H8c.rst @@ -0,0 +1,5 @@ +Increase ``io.DEFAULT_BUFFER_SIZE`` from 8k to 128k and adjust :func:`open` on +platforms where :meth:`os.fstat` provides a ``st_blksize`` field (such as Linux) +to use ``max(min(blocksize, 8 MiB), io.DEFAULT_BUFFER_SIZE)`` rather +than always using the device block size. This should improve I/O performance. +Patch by Romain Morotti. diff --git a/Modules/_io/_iomodule.c b/Modules/_io/_iomodule.c index 6622f2cabb908b..015e9e36cada43 100644 --- a/Modules/_io/_iomodule.c +++ b/Modules/_io/_iomodule.c @@ -60,8 +60,7 @@ PyDoc_STRVAR(module_doc, "DEFAULT_BUFFER_SIZE\n" "\n" " An int containing the default buffer size used by the module's buffered\n" -" I/O classes. open() uses the file's blksize (as obtained by os.stat) if\n" -" possible.\n" +" I/O classes.\n" ); @@ -132,9 +131,9 @@ the size of a fixed-size chunk buffer. When no buffering argument is given, the default buffering policy works as follows: * Binary files are buffered in fixed-size chunks; the size of the buffer - is chosen using a heuristic trying to determine the underlying device's - "block size" and falling back on `io.DEFAULT_BUFFER_SIZE`. - On many systems, the buffer will typically be 4096 or 8192 bytes long. + is max(min(blocksize, 8 MiB), DEFAULT_BUFFER_SIZE) + when the device block size is available. + On most systems, the buffer will typically be 128 kilobytes long. * "Interactive" text files (files for which isatty() returns True) use line buffering. 
Other text files use the policy described above @@ -200,7 +199,7 @@ static PyObject * _io_open_impl(PyObject *module, PyObject *file, const char *mode, int buffering, const char *encoding, const char *errors, const char *newline, int closefd, PyObject *opener) -/*[clinic end generated code: output=aefafc4ce2b46dc0 input=cd034e7cdfbf4e78]*/ +/*[clinic end generated code: output=aefafc4ce2b46dc0 input=28027fdaabb8d744]*/ { size_t i; @@ -371,6 +370,7 @@ _io_open_impl(PyObject *module, PyObject *file, const char *mode, Py_DECREF(blksize_obj); if (buffering == -1 && PyErr_Occurred()) goto error; + buffering = Py_MAX(Py_MIN(buffering, 8192 * 1024), DEFAULT_BUFFER_SIZE); } if (buffering < 0) { PyErr_SetString(PyExc_ValueError, diff --git a/Modules/_io/_iomodule.h b/Modules/_io/_iomodule.h index afd638a120ba08..18cf20edf26f7d 100644 --- a/Modules/_io/_iomodule.h +++ b/Modules/_io/_iomodule.h @@ -78,7 +78,7 @@ extern Py_ssize_t _PyIO_find_line_ending( */ extern int _PyIO_trap_eintr(void); -#define DEFAULT_BUFFER_SIZE (8 * 1024) /* bytes */ +#define DEFAULT_BUFFER_SIZE (128 * 1024) /* bytes */ /* * Offset type for positioning. 
diff --git a/Modules/_io/clinic/_iomodule.c.h b/Modules/_io/clinic/_iomodule.c.h
index 82932a23331ab6..9a41b364284459 100644
--- a/Modules/_io/clinic/_iomodule.c.h
+++ b/Modules/_io/clinic/_iomodule.c.h
@@ -64,9 +64,9 @@ PyDoc_STRVAR(_io_open__doc__,
 "given, the default buffering policy works as follows:\n"
 "\n"
 "* Binary files are buffered in fixed-size chunks; the size of the buffer\n"
-" is chosen using a heuristic trying to determine the underlying device\'s\n"
-" \"block size\" and falling back on `io.DEFAULT_BUFFER_SIZE`.\n"
-" On many systems, the buffer will typically be 4096 or 8192 bytes long.\n"
+" is max(min(blocksize, 8 MiB), DEFAULT_BUFFER_SIZE)\n"
+" when the device block size is available.\n"
+" On most systems, the buffer will typically be 128 kilobytes long.\n"
 "\n"
 "* \"Interactive\" text files (files for which isatty() returns True)\n"
 " use line buffering. Other text files use the policy described above\n"
@@ -406,4 +406,4 @@ _io_open_code(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObjec
 exit:
 return return_value;
 }
-/*[clinic end generated code: output=ec1df2ff5265ab16 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=2eaf6e914503bcfd input=a9049054013a1b77]*/
_______________________________________________
Python-checkins mailing list -- python-checkins@python.org
To unsubscribe send an email to python-checkins-le...@python.org
https://mail.python.org/mailman3/lists/python-checkins.python.org/
Member address: arch...@mail-archive.com