On Wed, Jun 14, 2023 at 12:40 PM John Naylor <john.nay...@enterprisedb.com>
wrote:
>
> On Sat, Nov 5, 2022 at 3:27 PM Andres Freund <and...@anarazel.de> wrote:

> > A real version would have to open /proc/self/maps and do this for at least
> > postgres' r-xp mapping. We could do it for libraries too, if they're suitably
> > aligned (both in memory and on-disk).

> For the postmaster, it should be simple to have a function that just
> takes the address of itself, then parses /proc/self/maps to find the
> boundaries within which it lies. I haven't thought about libraries much.
> Though with just the postmaster it seems that would give us the biggest
> bang for the buck?

Here's a start at that, trying with postmaster only. Unfortunately, I get
"MADV_COLLAPSE failed: Invalid argument". I tried different addresses with
no luck, and also got the same result with a small standalone program. I'm
on ext4, so I gather I don't need "cp --reflink=never" but tried it anyway.
Configuration looks normal according to
"grep HUGEPAGE /boot/config-$(uname -r)". Maybe there's something obvious
I'm missing?

--
John Naylor
EDB: http://www.enterprisedb.com
From ca38a370e866d27c8b51c83f8f18bdda1587b3df Mon Sep 17 00:00:00 2001
From: John Naylor <john.nay...@postgresql.org>
Date: Mon, 31 Oct 2022 15:24:29 +0700
Subject: [PATCH v2 2/2] Attempt to remap the .text segment into huge pages at
 postmaster start

Use MADV_COLLAPSE advice, available since Linux kernel 6.1.

Andres Freund and John Naylor
---
 src/backend/port/huge_page.c        | 113 ++++++++++++++++++++++++++++
 src/backend/port/meson.build        |   4 +
 src/backend/postmaster/postmaster.c |   7 ++
 src/include/port/huge_page.h        |  18 +++++
 4 files changed, 142 insertions(+)
 create mode 100644 src/backend/port/huge_page.c
 create mode 100644 src/include/port/huge_page.h

diff --git a/src/backend/port/huge_page.c b/src/backend/port/huge_page.c
new file mode 100644
index 0000000000..92f87bb3c2
--- /dev/null
+++ b/src/backend/port/huge_page.c
@@ -0,0 +1,113 @@
+/*-------------------------------------------------------------------------
+ *
+ * huge_page.c
+ *	  Map .text segment of binary to huge pages
+ *
+ * TODO: better rationale for separate file if the huge page handling
+ * in sysv_shmem.c were moved here.
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *	  src/backend/port/huge_page.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <sys/mman.h>
+
+#include "port/huge_page.h"
+#include "storage/fd.h"
+
+/*
+ * Best-effort attempt to back the mapping [addr, addr + advlen) with huge
+ * pages; failures are only reported on stderr. No-op unless built on Linux
+ * with headers that define MADV_COLLAPSE and MADV_POPULATE_READ.
+ */
+static void
+CollapseRegionToHugePages(void *addr, size_t advlen)
+{
+#if defined(__linux__) && defined(MADV_COLLAPSE) && defined(MADV_POPULATE_READ)
+	const size_t bound = 1024 * 1024 * 2;	/* FIXME: x86 huge page size */
+	size_t		advlen_up;
+	void	   *r2;
+	int			r;
+
+	fprintf(stderr, "old advlen: %zx\n", advlen);
+	advlen_up = (advlen + bound - 1) & ~(bound - 1);
+
+	/*
+	 * Grow the mapping over the trailing padding up to the next segment.
+	 * Otherwise the code in that range can't be put into a huge page:
+	 * access in the non-mapped range must fault, so it can't be part of one.
+	 * XXX: Should probably assert that that space is actually zeroes.
+	 */
+	r2 = mremap(addr, advlen, advlen_up, 0);
+	if (r2 == MAP_FAILED)
+		fprintf(stderr, "mremap failed: %m\n");
+	else if (r2 != addr)
+		fprintf(stderr, "mremap wrong addr: %p, expected %p\n", r2, addr);
+	else
+		advlen = advlen_up;
+
+	fprintf(stderr, "new advlen: %zx\n", advlen);
+
+	/*
+	 * The docs for MADV_COLLAPSE say there should be at least one page in
+	 * the mapped space "for every eligible hugepage-aligned/sized region to
+	 * be collapsed", so force that here. Probably not necessary.
+	 */
+	r = madvise(addr, advlen, MADV_WILLNEED);
+	if (r != 0)
+		fprintf(stderr, "MADV_WILLNEED failed: %m\n");
+
+	r = madvise(addr, advlen, MADV_POPULATE_READ);
+	if (r != 0)
+		fprintf(stderr, "MADV_POPULATE_READ failed: %m\n");
+
+	/*
+	 * Make huge pages out of it. MADV_COLLAPSE requires at least Linux 6.1;
+	 * if it fails, fall back to MADV_HUGEPAGE, though that doesn't do all
+	 * that much in older kernels.
+	 */
+	r = madvise(addr, advlen, MADV_COLLAPSE);
+	if (r != 0)
+	{
+		fprintf(stderr, "MADV_COLLAPSE failed: %m\n");
+
+		r = madvise(addr, advlen, MADV_HUGEPAGE);
+		if (r != 0)
+			fprintf(stderr, "MADV_HUGEPAGE failed: %m\n");
+	}
+#endif
+}
+
+/* Find the mapping holding the postgres .text segment; try huge pages. */
+void
+MapStaticCodeToLargePages(void)
+{
+#ifdef __linux__
+	void	   *self = (void *) &MapStaticCodeToLargePages;
+	FILE	   *fp = AllocateFile("/proc/self/maps", "r");
+	char		buf[128];		/* holds at least the leading "start-end" */
+	unsigned long addr, end;	/* unsigned long to match sscanf's %lx */
+
+	if (fp == NULL)
+		return;
+
+	/* Look for the line whose hex "start-end" range contains this function. */
+	while (fgets(buf, sizeof(buf), fp))
+	{
+		if (sscanf(buf, "%lx-%lx", &addr, &end) == 2 &&
+			addr <= (uintptr_t) self && (uintptr_t) self < end)
+		{
+			fprintf(stderr, "self: %p start: %lx end: %lx\n", self, addr, end);
+			CollapseRegionToHugePages((void *) addr, end - addr);
+			break;
+		}
+	}
+	FreeFile(fp);
+#endif
+}
diff --git a/src/backend/port/meson.build b/src/backend/port/meson.build
index 8fa68a88aa..af4d0c7bb7 100644
--- a/src/backend/port/meson.build
+++ b/src/backend/port/meson.build
@@ -25,6 +25,10 @@ if cdata.has('USE_WIN32_SHARED_MEMORY')
   backend_sources += files('win32_shmem.c')
 endif
 
+if host_system == 'linux'
+  backend_sources += files('huge_page.c')
+endif
+
 if host_system == 'windows'
   subdir('win32')
 endif
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 4c49393fc5..216e8c5730 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -106,6 +106,7 @@
 #include "pg_getopt.h"
 #include "pgstat.h"
 #include "port/pg_bswap.h"
+#include "port/huge_page.h"
 #include "postmaster/autovacuum.h"
 #include "postmaster/auxprocess.h"
 #include "postmaster/bgworker_internals.h"
@@ -1007,6 +1008,12 @@ PostmasterMain(int argc, char *argv[])
 	 */
 	process_shared_preload_libraries();
 
+	/*
+	 * Try to map the binary code to huge pages. We do this just after
+	 * any shared libraries are preloaded for future-proofing.
+	 */
+	MapStaticCodeToLargePages();
+
 	/*
 	 * Initialize SSL library, if specified.
 	 */
diff --git a/src/include/port/huge_page.h b/src/include/port/huge_page.h
new file mode 100644
index 0000000000..171819dd53
--- /dev/null
+++ b/src/include/port/huge_page.h
@@ -0,0 +1,18 @@
+/*-------------------------------------------------------------------------
+ *
+ * huge_page.h
+ *	  Map .text segment of binary to huge pages
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *	  src/include/port/huge_page.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef HUGE_PAGE_H
+#define HUGE_PAGE_H
+
+/* Attempt to remap the postgres .text segment into huge pages. */
+extern void MapStaticCodeToLargePages(void);
+#endif							/* HUGE_PAGE_H */
-- 
2.40.1

From e8a0c8633e969ad45eef82b40460df6552e6e550 Mon Sep 17 00:00:00 2001
From: John Naylor <john.nay...@postgresql.org>
Date: Fri, 2 Dec 2022 14:58:21 +0700
Subject: [PATCH v2 1/2] Align loadable segments to 2MB boundaries on Linux

Prerequisite for using huge pages for the .text section
on that platform.

TODO: autoconf support

Andres Freund and John Naylor
---
 meson.build | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/meson.build b/meson.build
index 16b2e86646..32af8bf5c3 100644
--- a/meson.build
+++ b/meson.build
@@ -248,6 +248,13 @@ elif host_system == 'freebsd'
 elif host_system == 'linux'
   sema_kind = 'unnamed_posix'
   cppflags += '-D_GNU_SOURCE'
+  # Align the loadable segments to 2MB boundaries to support remapping to
+  # huge pages.
+  ldflags += cc.get_supported_link_arguments([
+    '-Wl,-zmax-page-size=0x200000',
+    '-Wl,-zcommon-page-size=0x200000',
+    '-Wl,-zseparate-loadable-segments'
+  ])
 
 elif host_system == 'netbsd'
   # We must resolve all dynamic linking in the core server at program start.
-- 
2.40.1

Reply via email to