Hi, On 2025-06-05 12:47:52 -0400, Tom Lane wrote: > Andres Freund <and...@anarazel.de> writes: > > I think this is a big enough pitfall that it's, obviously assuming the patch > > has a sensible complexity, worth fixing this in 18. RMT, anyone, what do you > > think? > > Let's see the patch ... but yeah, I'd rather not ship 18 like this.
I've attached a first draft. I can't make heads or tails of the ordering in configure.ac, so the function test is probably in the wrong place. Greetings, Andres
>From aa740a18f2addffadc75defed05777f75dde0a6a Mon Sep 17 00:00:00 2001 From: Andres Freund <and...@anarazel.de> Date: Thu, 5 Jun 2025 14:10:33 -0400 Subject: [PATCH v1] wip: aio: Combine io_uring memory mappings, if supported Author: Reviewed-by: Discussion: https://postgr.es/m/cafbpf8oa44_ug+ryjcwh9wjf7e3ga6gka3gvh6nsrsnee9h...@mail.gmail.com Backpatch: --- meson.build | 6 + configure.ac | 7 + src/include/pg_config.h.in | 3 + src/backend/storage/aio/method_io_uring.c | 205 +++++++++++++++++++++- configure | 17 ++ src/tools/pgindent/typedefs.list | 1 + 6 files changed, 234 insertions(+), 5 deletions(-) diff --git a/meson.build b/meson.build index d142e3e408b..bc731362fb1 100644 --- a/meson.build +++ b/meson.build @@ -990,6 +990,12 @@ liburingopt = get_option('liburing') liburing = dependency('liburing', required: liburingopt) if liburing.found() cdata.set('USE_LIBURING', 1) + + if cc.has_function('io_uring_queue_init_mem', + dependencies: liburing, args: test_c_args) + cdata.set('HAVE_LIBURING_QUEUE_INIT_MEM', 1) + endif + endif diff --git a/configure.ac b/configure.ac index 4b8335dc613..14f485a453f 100644 --- a/configure.ac +++ b/configure.ac @@ -1420,6 +1420,13 @@ if test "$with_libxslt" = yes ; then AC_CHECK_LIB(xslt, xsltCleanupGlobals, [], [AC_MSG_ERROR([library 'xslt' is required for XSLT support])]) fi +if test "$with_liburing" = yes; then + _LIBS="$LIBS" + LIBS="$LIBURING_LIBS $LIBS" + AC_CHECK_FUNCS([io_uring_queue_init_mem]) + LIBS="$_LIBS" +fi + if test "$with_lz4" = yes ; then AC_CHECK_LIB(lz4, LZ4_compress_default, [], [AC_MSG_ERROR([library 'lz4' is required for LZ4 support])]) fi diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 726a7c1be1f..c4dc5d72bdb 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -229,6 +229,9 @@ /* Define to 1 if you have the global variable 'int timezone'. */ #undef HAVE_INT_TIMEZONE +/* Define to 1 if you have the `io_uring_queue_init_mem' function. */ +#undef HAVE_IO_URING_QUEUE_INIT_MEM + /* Define to 1 if __builtin_constant_p(x) implies "i"(x) acceptance. */ #undef HAVE_I_CONSTRAINT__BUILTIN_CONSTANT_P diff --git a/src/backend/storage/aio/method_io_uring.c b/src/backend/storage/aio/method_io_uring.c index cc312b641ca..bc7f0104d98 100644 --- a/src/backend/storage/aio/method_io_uring.c +++ b/src/backend/storage/aio/method_io_uring.c @@ -29,6 +29,7 @@ #ifdef IOMETHOD_IO_URING_ENABLED +#include <sys/mman.h> #include <liburing.h> #include "miscadmin.h" @@ -94,12 +95,32 @@ PgAioUringContext struct io_uring io_uring_ring; } PgAioUringContext; +/* + * Information about the capabilities that io_uring has. + * + * Depending on liburing and kernel version different features are + * supported. At least for the kernel a kernel version check does not suffice + * as various vendors do backport features to older kernels :(. + */ +typedef struct PgAioUringCaps +{ + bool checked; + /* -1 if io_uring_queue_init_mem() is unsupported */ + int mem_init_size; +} PgAioUringCaps; + + /* PgAioUringContexts for all backends */ static PgAioUringContext *pgaio_uring_contexts; /* the current backend's context */ static PgAioUringContext *pgaio_my_uring_context; +static PgAioUringCaps pgaio_uring_caps = +{ + .checked = false, + .mem_init_size = -1, +}; static uint32 pgaio_uring_procs(void) @@ -111,16 +132,144 @@ pgaio_uring_procs(void) return MaxBackends + NUM_AUXILIARY_PROCS - MAX_IO_WORKERS; } +/* + * Initializes pgaio_uring_caps unless that's already done + */ +static void +pgaio_uring_check_capabilities(void) +{ + if (pgaio_uring_caps.checked) + return; + + /* + * By default io_uring creates a shared memory mapping for io_uring + * instance, leading to a large number of memory mappings. Unfortunately a + * large number of memory mappings slows things down, backend exit is + * particularly affected. To address that newer kernels (6.5) support + * using user-provided memory for the memory, by putting the relevant + * memory into shared memory we don't need any additional mappings. + * + * To know whether this is supported we unfortunately need to probe the + * kernel by trying to create a ring with userspace-provided memory. This + * also has a secondary benefit: We can check precisely how much memory we + * need for each io_uring instance. + */ +#if defined(HAVE_LIBURING_QUEUE_INIT_MEM) && defined(IORING_SETUP_NO_MMAP) && 1 + { + struct io_uring test_ring; + size_t ring_size; + void *ring_ptr; + struct io_uring_params p = {0}; + int ret; + + /* + * Liburing does not yet provide an API to query how much memory a + * ring will need. So we over-estimate it here. As the memory is freed + * just below that's small temporary waste of memory. + * + * 1MB is more than enough for rings within io_max_concurrency's + * range. + */ + ring_size = 1024 * 1024; + + /* + * Hard to believe a system exists where 1MB would not be a multiple + * of the page size. But it's cheap to ensure... + */ + ring_size -= ring_size % sysconf(_SC_PAGESIZE); + + ring_ptr = mmap(NULL, ring_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (ring_ptr == MAP_FAILED) + elog(ERROR, + "mmap(%zu) to determine io_uring_queue_init_mem() support has failed: %m", + ring_size); + + ret = io_uring_queue_init_mem(io_max_concurrency, &test_ring, &p, ring_ptr, ring_size); + if (ret > 0) + { + pgaio_uring_caps.mem_init_size = ret; + /* FIXME: This should probably not stay at DEBUG1? */ + elog(DEBUG1, + "can use combined memory mapping for io_uring, each ring needs %d bytes", + ret); + + /* clean up the created ring, it was just for a test */ + io_uring_queue_exit(&test_ring); + } + else + { + /* + * There are different reasons for ring creation to fail, but it's + * ok to treat that just as io_uring_queue_init_mem() not being + * supported. We'll report a more detailed error in + * pgaio_uring_shmem_init(). + */ + errno = -ret; + elog(DEBUG1, + "cannot use combined memory mapping for io_uring, ring creation failed with %m"); + + } + + if (munmap(ring_ptr, ring_size) != 0) + elog(ERROR, "munmap() failed: %m"); + } +#else + { + elog(DEBUG1, + "can't use combined memory mapping for io_uring, kernel or liburing too old"); + } +#endif + + pgaio_uring_caps.checked = true; +} + +/* + * Memory for all PgAioUringContext instances + */ static Size pgaio_uring_context_shmem_size(void) { return mul_size(pgaio_uring_procs(), sizeof(PgAioUringContext)); } +/* + * Memory for the combined memory used by io_uring instances. Returns 0 if + * that is not supported by kernel/liburing. + */ +static Size +pgaio_uring_ring_shmem_size(void) +{ + size_t sz = 0; + + if (pgaio_uring_caps.mem_init_size > 0) + { + /* + * Memory for rings needs to be allocated to the page boundary, + * reserve space. Luckily it does not need to be aligned to hugepage + * boundaries, even if huge pages are used. + */ + sz = add_size(sz, sysconf(_SC_PAGESIZE)); + sz = add_size(sz, mul_size(pgaio_uring_procs(), pgaio_uring_caps.mem_init_size)); + } + + return sz; +} + static size_t pgaio_uring_shmem_size(void) { - return pgaio_uring_context_shmem_size(); + size_t sz; + + /* + * Kernel and liburing support for various features influences how much + * shmem we need, perform the necessary checks. + */ + pgaio_uring_check_capabilities(); + + sz = pgaio_uring_context_shmem_size(); + sz = add_size(sz, pgaio_uring_ring_shmem_size()); + + return sz; } static void @@ -128,13 +277,38 @@ pgaio_uring_shmem_init(bool first_time) { int TotalProcs = pgaio_uring_procs(); bool found; + char *shmem; + size_t ring_mem_remain = 0; + char *ring_mem_next = 0; - pgaio_uring_contexts = (PgAioUringContext *) - ShmemInitStruct("AioUring", pgaio_uring_shmem_size(), &found); - + /* + * XXX: We allocate memory for all PgAioUringContext instances and, if + * supported, the memory required for each of the io_uring instances, in + * one ShmemInitStruct(). Should we instead separate the latter into a + * separate ShmemInitStruct()? + */ + shmem = ShmemInitStruct("AioUringContext", pgaio_uring_shmem_size(), &found); if (found) return; + pgaio_uring_contexts = (PgAioUringContext *) shmem; + shmem += pgaio_uring_context_shmem_size(); + + if (pgaio_uring_caps.mem_init_size > 0) + { + ring_mem_remain = pgaio_uring_ring_shmem_size(); + ring_mem_next = (char *) shmem; + + /* align to page boundary, see also pgaio_uring_ring_shmem_size() */ + ring_mem_next = (char *) TYPEALIGN(sysconf(_SC_PAGESIZE), ring_mem_next); + + /* account for alignment */ + ring_mem_remain -= ring_mem_next - shmem; + shmem += ring_mem_next - shmem; + + shmem += ring_mem_remain; + } + for (int contextno = 0; contextno < TotalProcs; contextno++) { PgAioUringContext *context = &pgaio_uring_contexts[contextno]; @@ -158,7 +332,28 @@ pgaio_uring_shmem_init(bool first_time) * be worth using that - also need to evaluate if that causes * noticeable additional contention? */ - ret = io_uring_queue_init(io_max_concurrency, &context->io_uring_ring, 0); + + /* + * If supported (c.f. pgaio_uring_check_capabilities()), create ring + * with its data in shared memory. Otherwise fall back io_uring + * creating a memory mapping for each ring. + */ +#if defined(HAVE_LIBURING_QUEUE_INIT_MEM) && defined(IORING_SETUP_NO_MMAP) + if (pgaio_uring_caps.mem_init_size > 0) + { + struct io_uring_params p = {0}; + + ret = io_uring_queue_init_mem(io_max_concurrency, &context->io_uring_ring, &p, ring_mem_next, ring_mem_remain); + + ring_mem_remain -= ret; + ring_mem_next += ret; + } + else +#endif + { + ret = io_uring_queue_init(io_max_concurrency, &context->io_uring_ring, 0); + } + if (ret < 0) { char *hint = NULL; diff --git a/configure b/configure index 4f15347cc95..5ae8edee079 100755 --- a/configure +++ b/configure @@ -13309,6 +13309,23 @@ fi fi +if test "$with_liburing" = yes; then + _LIBS="$LIBS" + LIBS="$LIBURING_LIBS $LIBS" + for ac_func in io_uring_queue_init_mem +do : + ac_fn_c_check_func "$LINENO" "io_uring_queue_init_mem" "ac_cv_func_io_uring_queue_init_mem" +if test "x$ac_cv_func_io_uring_queue_init_mem" = xyes; then : + cat >>confdefs.h <<_ACEOF +#define HAVE_IO_URING_QUEUE_INIT_MEM 1 +_ACEOF + +fi +done + + LIBS="$_LIBS" +fi + if test "$with_lz4" = yes ; then { $as_echo "$as_me:${as_lineno-$LINENO}: checking for LZ4_compress_default in -llz4" >&5 $as_echo_n "checking for LZ4_compress_default in -llz4... " >&6; } diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index a8346cda633..83299ccc427 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -2171,6 +2171,7 @@ PgAioReturn PgAioTargetData PgAioTargetID PgAioTargetInfo +PgAioUringCaps PgAioUringContext PgAioWaitRef PgArchData -- 2.48.1.76.g4e746b1a31.dirty