https://github.com/python/cpython/commit/b1b4f9625c5f2a6b2c32bc5ee91c9fef3894b5e6
commit: b1b4f9625c5f2a6b2c32bc5ee91c9fef3894b5e6
branch: main
author: morotti <r.moro...@gmail.com>
committer: gpshead <g...@krypto.org>
date: 2025-03-07T11:36:12-08:00
summary:

gh-117151: IO performance improvement, increase io.DEFAULT_BUFFER_SIZE to 128k (GH-118144)


Co-authored-by: rmorotti <romain.moro...@man.com>

files:
A Misc/NEWS.d/next/Library/2024-04-30-14-03-09.gh-issue-117151.yt2H8c.rst
M Doc/library/functions.rst
M Lib/_pyio.py
M Lib/test/test_file.py
M Modules/_io/_iomodule.c
M Modules/_io/_iomodule.h
M Modules/_io/clinic/_iomodule.c.h

diff --git a/Doc/library/functions.rst b/Doc/library/functions.rst
index a7549b9bce76e2..7e367a0f2b6b25 100644
--- a/Doc/library/functions.rst
+++ b/Doc/library/functions.rst
@@ -1405,10 +1405,10 @@ are always available.  They are listed here in alphabetical order.
    :func:`io.TextIOWrapper.reconfigure`. When no *buffering* argument is
    given, the default buffering policy works as follows:
 
-   * Binary files are buffered in fixed-size chunks; the size of the buffer is
-     chosen using a heuristic trying to determine the underlying device's "block
-     size" and falling back on :const:`io.DEFAULT_BUFFER_SIZE`.  On many systems,
-     the buffer will typically be 4096 or 8192 bytes long.
+   * Binary files are buffered in fixed-size chunks; the size of the buffer
+     is ``max(min(blocksize, 8 MiB), DEFAULT_BUFFER_SIZE)``
+     when the device block size is available.
+     On most systems, the buffer will typically be 128 kilobytes long.
 
    * "Interactive" text files (files for which :meth:`~io.IOBase.isatty`
      returns ``True``) use line buffering.  Other text files use the policy
diff --git a/Lib/_pyio.py b/Lib/_pyio.py
index 99d6f796de416e..b875103bee441c 100644
--- a/Lib/_pyio.py
+++ b/Lib/_pyio.py
@@ -23,8 +23,9 @@
     valid_seek_flags.add(os.SEEK_HOLE)
     valid_seek_flags.add(os.SEEK_DATA)
 
-# open() uses st_blksize whenever we can
-DEFAULT_BUFFER_SIZE = 8 * 1024  # bytes
+# open() uses max(min(blocksize, 8 MiB), DEFAULT_BUFFER_SIZE)
+# when the device block size is available.
+DEFAULT_BUFFER_SIZE = 128 * 1024  # bytes
 
 # NOTE: Base classes defined here are registered with the "official" ABCs
 # defined in io.py. We don't use real inheritance though, because we don't want
@@ -123,10 +124,10 @@ def open(file, mode="r", buffering=-1, encoding=None, errors=None,
     the size of a fixed-size chunk buffer.  When no buffering argument is
     given, the default buffering policy works as follows:
 
-    * Binary files are buffered in fixed-size chunks; the size of the buffer
-      is chosen using a heuristic trying to determine the underlying device's
-      "block size" and falling back on `io.DEFAULT_BUFFER_SIZE`.
-      On many systems, the buffer will typically be 4096 or 8192 bytes long.
+   * Binary files are buffered in fixed-size chunks; the size of the buffer
+     is max(min(blocksize, 8 MiB), DEFAULT_BUFFER_SIZE)
+     when the device block size is available.
+     On most systems, the buffer will typically be 128 kilobytes long.
 
     * "Interactive" text files (files for which isatty() returns True)
       use line buffering.  Other text files use the policy described above
@@ -242,7 +243,7 @@ def open(file, mode="r", buffering=-1, encoding=None, errors=None,
             buffering = -1
             line_buffering = True
         if buffering < 0:
-            buffering = raw._blksize
+            buffering = max(min(raw._blksize, 8192 * 1024), DEFAULT_BUFFER_SIZE)
         if buffering < 0:
             raise ValueError("invalid buffering size")
         if buffering == 0:
diff --git a/Lib/test/test_file.py b/Lib/test/test_file.py
index 1206032a93566e..029c903e01afb9 100644
--- a/Lib/test/test_file.py
+++ b/Lib/test/test_file.py
@@ -216,6 +216,16 @@ def testSetBufferSize(self):
         with self.assertWarnsRegex(RuntimeWarning, 'line buffering'):
             self._checkBufferSize(1)
 
+    def testDefaultBufferSize(self):
+        with self.open(TESTFN, 'wb') as f:
+            blksize = f.raw._blksize
+            f.write(b"\0" * 5_000_000)
+
+        with self.open(TESTFN, 'rb') as f:
+            data = f.read1()
+            expected_size = max(min(blksize, 8192 * 1024), io.DEFAULT_BUFFER_SIZE)
+            self.assertEqual(len(data), expected_size)
+
     def testTruncateOnWindows(self):
         # SF bug <https://bugs.python.org/issue801631>
         # "file.truncate fault on windows"
diff --git a/Misc/NEWS.d/next/Library/2024-04-30-14-03-09.gh-issue-117151.yt2H8c.rst b/Misc/NEWS.d/next/Library/2024-04-30-14-03-09.gh-issue-117151.yt2H8c.rst
new file mode 100644
index 00000000000000..6b13debcdccb48
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2024-04-30-14-03-09.gh-issue-117151.yt2H8c.rst
@@ -0,0 +1,5 @@
+Increase ``io.DEFAULT_BUFFER_SIZE`` from 8k to 128k and adjust :func:`open` on
+platforms where :meth:`os.fstat` provides a ``st_blksize`` field (such as Linux)
+to use ``max(min(blocksize, 8 MiB), io.DEFAULT_BUFFER_SIZE)`` rather
+than always using the device block size. This should improve I/O performance.
+Patch by Romain Morotti.
diff --git a/Modules/_io/_iomodule.c b/Modules/_io/_iomodule.c
index 6622f2cabb908b..015e9e36cada43 100644
--- a/Modules/_io/_iomodule.c
+++ b/Modules/_io/_iomodule.c
@@ -60,8 +60,7 @@ PyDoc_STRVAR(module_doc,
 "DEFAULT_BUFFER_SIZE\n"
 "\n"
 "   An int containing the default buffer size used by the module's buffered\n"
-"   I/O classes. open() uses the file's blksize (as obtained by os.stat) if\n"
-"   possible.\n"
+"   I/O classes.\n"
     );
 
 
@@ -132,9 +131,9 @@ the size of a fixed-size chunk buffer.  When no buffering argument is
 given, the default buffering policy works as follows:
 
 * Binary files are buffered in fixed-size chunks; the size of the buffer
-  is chosen using a heuristic trying to determine the underlying device's
-  "block size" and falling back on `io.DEFAULT_BUFFER_SIZE`.
-  On many systems, the buffer will typically be 4096 or 8192 bytes long.
+ is max(min(blocksize, 8 MiB), DEFAULT_BUFFER_SIZE)
+ when the device block size is available.
+ On most systems, the buffer will typically be 128 kilobytes long.
 
 * "Interactive" text files (files for which isatty() returns True)
   use line buffering.  Other text files use the policy described above
@@ -200,7 +199,7 @@ static PyObject *
 _io_open_impl(PyObject *module, PyObject *file, const char *mode,
               int buffering, const char *encoding, const char *errors,
               const char *newline, int closefd, PyObject *opener)
-/*[clinic end generated code: output=aefafc4ce2b46dc0 input=cd034e7cdfbf4e78]*/
+/*[clinic end generated code: output=aefafc4ce2b46dc0 input=28027fdaabb8d744]*/
 {
     size_t i;
 
@@ -371,6 +370,7 @@ _io_open_impl(PyObject *module, PyObject *file, const char *mode,
         Py_DECREF(blksize_obj);
         if (buffering == -1 && PyErr_Occurred())
             goto error;
+        buffering = Py_MAX(Py_MIN(buffering, 8192 * 1024), DEFAULT_BUFFER_SIZE);
     }
     if (buffering < 0) {
         PyErr_SetString(PyExc_ValueError,
diff --git a/Modules/_io/_iomodule.h b/Modules/_io/_iomodule.h
index afd638a120ba08..18cf20edf26f7d 100644
--- a/Modules/_io/_iomodule.h
+++ b/Modules/_io/_iomodule.h
@@ -78,7 +78,7 @@ extern Py_ssize_t _PyIO_find_line_ending(
 */
 extern int _PyIO_trap_eintr(void);
 
-#define DEFAULT_BUFFER_SIZE (8 * 1024)  /* bytes */
+#define DEFAULT_BUFFER_SIZE (128 * 1024)  /* bytes */
 
 /*
  * Offset type for positioning.
diff --git a/Modules/_io/clinic/_iomodule.c.h b/Modules/_io/clinic/_iomodule.c.h
index 82932a23331ab6..9a41b364284459 100644
--- a/Modules/_io/clinic/_iomodule.c.h
+++ b/Modules/_io/clinic/_iomodule.c.h
@@ -64,9 +64,9 @@ PyDoc_STRVAR(_io_open__doc__,
 "given, the default buffering policy works as follows:\n"
 "\n"
 "* Binary files are buffered in fixed-size chunks; the size of the buffer\n"
-"  is chosen using a heuristic trying to determine the underlying device\'s\n"
-"  \"block size\" and falling back on `io.DEFAULT_BUFFER_SIZE`.\n"
-"  On many systems, the buffer will typically be 4096 or 8192 bytes long.\n"
+" is max(min(blocksize, 8 MiB), DEFAULT_BUFFER_SIZE)\n"
+" when the device block size is available.\n"
+" On most systems, the buffer will typically be 128 kilobytes long.\n"
 "\n"
 "* \"Interactive\" text files (files for which isatty() returns True)\n"
 "  use line buffering.  Other text files use the policy described above\n"
@@ -406,4 +406,4 @@ _io_open_code(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObjec
 exit:
     return return_value;
 }
-/*[clinic end generated code: output=ec1df2ff5265ab16 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=2eaf6e914503bcfd input=a9049054013a1b77]*/

_______________________________________________
Python-checkins mailing list -- python-checkins@python.org
To unsubscribe send an email to python-checkins-le...@python.org
https://mail.python.org/mailman3/lists/python-checkins.python.org/
Member address: arch...@mail-archive.com

Reply via email to