On Wed, Jun 14, 2023 at 12:40 PM John Naylor <john.nay...@enterprisedb.com> wrote: > > On Sat, Nov 5, 2022 at 3:27 PM Andres Freund <and...@anarazel.de> wrote:
> > A real version would have to open /proc/self/maps and do this for at least > > postgres' r-xp mapping. We could do it for libraries too, if they're suitably > > aligned (both in memory and on-disk). > For the postmaster, it should be simple to have a function that just takes the address of itself, then parses /proc/self/maps to find the boundaries within which it lies. I haven't thought about libraries much. Though with just the postmaster it seems that would give us the biggest bang for the buck? Here's a start at that, trying with postmaster only. Unfortunately, I get "MADV_COLLAPSE failed: Invalid argument". I tried different addresses with no luck, and also got the same result with a small standalone program. I'm on ext4, so I gather I don't need "cp --reflink=never" but tried it anyway. Configuration looks normal by "grep HUGEPAGE /boot/config-$(uname -r)". Maybe there's something obvious I'm missing? -- John Naylor EDB: http://www.enterprisedb.com
From ca38a370e866d27c8b51c83f8f18bdda1587b3df Mon Sep 17 00:00:00 2001 From: John Naylor <john.nay...@postgresql.org> Date: Mon, 31 Oct 2022 15:24:29 +0700 Subject: [PATCH v2 2/2] Attempt to remap the .text segment into huge pages at postmaster start Use MADV_COLLAPSE advice, available since Linux kernel 6.1. Andres Freund and John Naylor --- src/backend/port/huge_page.c | 113 ++++++++++++++++++++++++++++ src/backend/port/meson.build | 4 + src/backend/postmaster/postmaster.c | 7 ++ src/include/port/huge_page.h | 18 +++++ 4 files changed, 142 insertions(+) create mode 100644 src/backend/port/huge_page.c create mode 100644 src/include/port/huge_page.h diff --git a/src/backend/port/huge_page.c b/src/backend/port/huge_page.c new file mode 100644 index 0000000000..92f87bb3c2 --- /dev/null +++ b/src/backend/port/huge_page.c @@ -0,0 +1,113 @@ +/*------------------------------------------------------------------------- + * + * huge_page.c + * Map .text segment of binary to huge pages + * + * TODO: better rationale for separate file if the huge page handling + * in sysv_shmem.c were moved here. + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/port/huge_page.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <sys/mman.h> + +#include "port/huge_page.h" +#include "storage/fd.h" + +/* + * Collapse specified memory range to huge pages. + */ +static void +CollapseRegionToHugePages(void *addr, size_t advlen) +{ +#ifdef __linux__ + size_t advlen_up; + int r; + void *r2; + const size_t bound = 1024*1024*2; // FIXME: x86 + + fprintf(stderr, "old advlen: %lx\n", advlen); + advlen_up = (advlen + bound - 1) & ~(bound - 1); + + /* + * Increase size of mapping to cover the trailing padding to the next + * segment. 
Otherwise all the code in that range can't be put into + * a huge page (access in the non-mapped range needs to cause a fault, + * hence can't be in the huge page). + * XXX: Should probably assert that that space is actually zeroes. + */ + r2 = mremap(addr, advlen, advlen_up, 0); + if (r2 == MAP_FAILED) + fprintf(stderr, "mremap failed: %m\n"); + else if (r2 != addr) + fprintf(stderr, "mremap wrong addr: %m\n"); + else + advlen = advlen_up; + + fprintf(stderr, "new advlen: %lx\n", advlen); + + /* + * The docs for MADV_COLLAPSE say there should be at least one page + * in the mapped space "for every eligible hugepage-aligned/sized + * region to be collapsed". I just forced that. But probably not + * necessary. + */ + r = madvise(addr, advlen, MADV_WILLNEED); + if (r != 0) + fprintf(stderr, "MADV_WILLNEED failed: %m\n"); + + r = madvise(addr, advlen, MADV_POPULATE_READ); + if (r != 0) + fprintf(stderr, "MADV_POPULATE_READ failed: %m\n"); + + /* + * Make huge pages out of it. Requires at least linux 6.1. We could + * fall back to MADV_HUGEPAGE if it fails, but it doesn't do all that + * much in older kernels. + */ + r = madvise(addr, advlen, MADV_COLLAPSE); + if (r != 0) + { + fprintf(stderr, "MADV_COLLAPSE failed: %m\n"); + + r = madvise(addr, advlen, MADV_HUGEPAGE); + if (r != 0) + fprintf(stderr, "MADV_HUGEPAGE failed: %m\n"); + } +#endif +} + +/* Map the postgres .text segment into huge pages. */ +void +MapStaticCodeToLargePages(void) +{ +#ifdef __linux__ + FILE *fp = AllocateFile("/proc/self/maps", "r"); + char buf[128]; // got this from code reading /proc/meminfo -- enough? 
+ uintptr_t addr; + uintptr_t end; + void * self = &MapStaticCodeToLargePages; + + if (fp) + { + while (fgets(buf, sizeof(buf), fp)) + { + if (sscanf(buf, "%lx-%lx", &addr, &end) == 2 && + addr <= (uintptr_t) self && (uintptr_t) self < end) + { + fprintf(stderr, "self: %p start: %lx end: %lx\n", self, addr, end); + CollapseRegionToHugePages((void *) addr, end - addr); + break; + } + } + FreeFile(fp); + } +#endif +} diff --git a/src/backend/port/meson.build b/src/backend/port/meson.build index 8fa68a88aa..af4d0c7bb7 100644 --- a/src/backend/port/meson.build +++ b/src/backend/port/meson.build @@ -25,6 +25,10 @@ if cdata.has('USE_WIN32_SHARED_MEMORY') backend_sources += files('win32_shmem.c') endif +if host_system == 'linux' + backend_sources += files('huge_page.c') +endif + if host_system == 'windows' subdir('win32') endif diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 4c49393fc5..216e8c5730 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -106,6 +106,7 @@ #include "pg_getopt.h" #include "pgstat.h" #include "port/pg_bswap.h" +#include "port/huge_page.h" #include "postmaster/autovacuum.h" #include "postmaster/auxprocess.h" #include "postmaster/bgworker_internals.h" @@ -1007,6 +1008,12 @@ PostmasterMain(int argc, char *argv[]) */ process_shared_preload_libraries(); + /* + * Try to map the binary code to huge pages. We do this just after + * any shared libraries are preloaded for future-proofing. + */ + MapStaticCodeToLargePages(); + /* * Initialize SSL library, if specified. 
*/ diff --git a/src/include/port/huge_page.h b/src/include/port/huge_page.h new file mode 100644 index 0000000000..171819dd53 --- /dev/null +++ b/src/include/port/huge_page.h @@ -0,0 +1,18 @@ +/*------------------------------------------------------------------------- + * + * huge_page.h + * Map .text segment of binary to huge pages + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/port/huge_page.h + * + *------------------------------------------------------------------------- + */ +#ifndef LARGE_PAGE_H +#define LARGE_PAGE_H + +extern void MapStaticCodeToLargePages(void); + +#endif /* LARGE_PAGE_H */ -- 2.40.1
From e8a0c8633e969ad45eef82b40460df6552e6e550 Mon Sep 17 00:00:00 2001 From: John Naylor <john.nay...@postgresql.org> Date: Fri, 2 Dec 2022 14:58:21 +0700 Subject: [PATCH v2 1/2] Align loadable segments to 2MB boundaries on Linux Prerequisite for using huge pages for the .text section on that platform. TODO: autoconf support Andres Freund and John Naylor --- meson.build | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/meson.build b/meson.build index 16b2e86646..32af8bf5c3 100644 --- a/meson.build +++ b/meson.build @@ -248,6 +248,13 @@ elif host_system == 'freebsd' elif host_system == 'linux' sema_kind = 'unnamed_posix' cppflags += '-D_GNU_SOURCE' + # Align the loadable segments to 2MB boundaries to support remapping to + # huge pages. + ldflags += cc.get_supported_link_arguments([ + '-Wl,-zmax-page-size=0x200000', + '-Wl,-zcommon-page-size=0x200000', + '-Wl,-zseparate-loadable-segments' + ]) elif host_system == 'netbsd' # We must resolve all dynamic linking in the core server at program start. -- 2.40.1