Hi!

While playing with the memory context internals some time ago, I've been
wondering if there are better ways to allocate memory from the kernel -
either tweaking libc somehow, or maybe interacting with kernel directly.

I mostly forgot about that topic, but after the local conference last
week we went to a pub and one of the things we discussed over a beer was
how complex and unintuitive the memory stuff is, because of the libc
heuristics, 'sbrk' properties [1] and behavior in the presence of holes,
OOM etc.

The virtual memory system should handle this to a large degree, but I've
repeatedly run into problems when that was not the case (for example the
memory is still part of the virtual address space, and thus counted by OOM).

One of the wilder ideas (I mentioned beer was involved!) was a memory
allocator based on mmap [2], bypassing the libc malloc implementation
altogether. mmap() has some nice features (e.g. no issues with returning
memory back to the kernel, which may be a problem with sbrk). So I hacked
a bit and switched the AllocSet implementation to mmap().

And it works surprisingly well, so here is an experimental patch for
comments whether this really is a good idea etc. Some parts of the patch
are a bit dirty and I've only tested it on x86.


Notes
-----

(1) The main changes are mostly trivial, rewriting malloc()/free() to
    mmap()/munmap(), plus related changes (e.g. mmap() returns (void*)-1
    instead of NULL in case of failure).

(2) A significant difference is that mmap() can't allocate blocks
    smaller than page size, which is 4kB on x86. This means the
    smallest context is 4kB (instead of 1kB), and also affects the
    growth of block size (it can only grow to 8kB). So this changes
    the AllocSet internal behavior a bit.

(3) As this bypasses libc, it can't use the libc freelists (which are
    used by malloc). To compensate for this, there's a simple
    process-level freelist of blocks, shared by all memory contexts.
    This cache has a limited capacity (roughly 4MB per process).

(4) Some of the comments are obsolete, still referencing malloc/free.


Benchmarks
----------

I've done extensive testing and also benchmarking, and it seems to be no
slower than the current implementation, and in some cases is actually a
bit faster.

a) time pgbench -i -s 300

   - pgbench initialisation, measuring the COPY and the total duration.
   - averages of 3 runs (negligible variations between runs)

               COPY           total
    ---------------------------------
    master     0:26.22         1:22
    mmap       0:26.35         1:22

   Pretty much no difference.

b) pgbench -S -c 8 -j 8 -T 60

   - short read-only runs (1 minute) after a warmup
   - min, max, average of 8 runs

               average      min      max
    -------------------------------------
    master       96785    95329    97981
    mmap         98279    97259    99671

   That's a rather consistent 1-2% improvement.

c) REINDEX pgbench_accounts_pkey

   - large maintenance_work_mem so that it's in-memory sort
   - average, min, max of 8 runs (duration in seconds)

               average      min      max
    -------------------------------------
    master       10.35    9.64     13.56
    mmap          9.85    9.81      9.90

    Again, mostly improvement, except for the minimum where the current
    memory context was a bit faster. But overall the mmap-based one is
    much more consistent.

Some of the differences may be due to allocating 4kB blocks from the
very start (while the current allocator starts with 1kB, then 2kB and
finally 4kB).

Ideas, opinions?


[1] http://linux.die.net/man/2/sbrk
[2] http://linux.die.net/man/2/mmap

-- 
Tomas Vondra                http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
diff --git a/src/backend/utils/mmgr/aset.c b/src/backend/utils/mmgr/aset.c
index 0759e39..65e7f66 100644
--- a/src/backend/utils/mmgr/aset.c
+++ b/src/backend/utils/mmgr/aset.c
@@ -89,6 +89,8 @@
 #include "utils/memdebug.h"
 #include "utils/memutils.h"
 
+#include <sys/mman.h>
+
 /* Define this to detail debug alloc information */
 /* #define HAVE_ALLOCINFO */
 
@@ -117,13 +119,11 @@
  * wastage.)
  *--------------------
  */
-
+#define PAGE_SIZE			4096	/* getconf PAGE_SIZE */
 #define ALLOC_MINBITS		3	/* smallest chunk size is 8 bytes */
-#define ALLOCSET_NUM_FREELISTS	11
+#define ALLOCSET_NUM_FREELISTS	11	/* FIXME probably should be only 9 */
 #define ALLOC_CHUNK_LIMIT	(1 << (ALLOCSET_NUM_FREELISTS-1+ALLOC_MINBITS))
-/* Size of largest chunk that we use a fixed size for */
-#define ALLOC_CHUNK_FRACTION	4
-/* We allow chunks to be at most 1/4 of maxBlockSize (less overhead) */
+/* Size of largest chunk that we use a fixed size for (2kB) */
 
 /*--------------------
  * The first block allocated for an allocset has size initBlockSize.
@@ -242,6 +242,31 @@ typedef struct AllocChunkData
 #define AllocChunkGetPointer(chk)	\
 					((AllocPointer)(((char *)(chk)) + ALLOC_CHUNKHDRSZ))
 
+#ifdef MMAP_STATS
+Size	allocated_bytes = 0;
+Size	allocated_chunks = 0;
+
+Size	freed_bytes = 0;
+Size	freed_chunks = 0;
+
+Size	reuse_count = 0;
+
+int		freecounts[ALLOCSET_NUM_FREELISTS] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+static void	print_stats(void);
+static void print_freelist_stats(void);
+#endif
+
+/*
+ * cache/freelist of mmap-ed blocks (shared by all contexts within the process)
+ *
+ * The cache has a limited capacity, roughly ~4MB, the index is the number
+ * of pages (4kB) of a block, minus 1. So the cache can store 4kB, 8kB, 12kB,
+ * 16kB, 20kB and 24kB blocks, with the initial capacity for each size.
+ */
+static AllocBlock	mmap_cache_blocks[6] = {NULL, NULL, NULL, NULL, NULL, NULL};
+static int			mmap_cache_slots[6]  = {256,  128, 64, 32, 16, 16};
+
 /*
  * These functions implement the MemoryContext API for AllocSet contexts.
  */
@@ -450,11 +475,11 @@ AllocSetContextCreate(MemoryContext parent,
 	/*
 	 * Make sure alloc parameters are reasonable, and save them.
 	 *
-	 * We somewhat arbitrarily enforce a minimum 1K block size.
+	 * mmap can't really allocate less than PAGE_SIZE blocks.
 	 */
 	initBlockSize = MAXALIGN(initBlockSize);
-	if (initBlockSize < 1024)
-		initBlockSize = 1024;
+	if (initBlockSize < PAGE_SIZE)
+		initBlockSize = PAGE_SIZE;
 	maxBlockSize = MAXALIGN(maxBlockSize);
 	if (maxBlockSize < initBlockSize)
 		maxBlockSize = initBlockSize;
@@ -464,24 +489,16 @@ AllocSetContextCreate(MemoryContext parent,
 	context->nextBlockSize = initBlockSize;
 
 	/*
-	 * Compute the allocation chunk size limit for this context.  It can't be
-	 * more than ALLOC_CHUNK_LIMIT because of the fixed number of freelists.
-	 * If maxBlockSize is small then requests exceeding the maxBlockSize, or
-	 * even a significant fraction of it, should be treated as large chunks
-	 * too.  For the typical case of maxBlockSize a power of 2, the chunk size
-	 * limit will be at most 1/8th maxBlockSize, so that given a stream of
-	 * requests that are all the maximum chunk size we will waste at most
-	 * 1/8th of the allocated space.
-	 *
-	 * We have to have allocChunkLimit a power of two, because the requested
-	 * and actually-allocated sizes of any chunk must be on the same side of
-	 * the limit, else we get confused about whether the chunk is "big".
+	 * The current implementation only allocates 4kB blocks (matching the size
+	 * of memory page), and never increases it, so we're using static chunk
+	 * size limit 2kB. It's impossible to use smaller blocks (because mmap
+	 * can't allocate less than a page, and rounds up to 4kB), but maybe
+	 * we could use 4kB and 8kB, to match the original AllocSet allocator.
+	 * Anyway, the 2kB limit still works (it matches the 1/4 fraction).
 	 */
 	context->allocChunkLimit = ALLOC_CHUNK_LIMIT;
-	while ((Size) (context->allocChunkLimit + ALLOC_CHUNKHDRSZ) >
-		   (Size) ((maxBlockSize - ALLOC_BLOCKHDRSZ) / ALLOC_CHUNK_FRACTION))
-		context->allocChunkLimit >>= 1;
 
+	minContextSize = PAGE_SIZE;
 	/*
 	 * Grab always-allocated space, if requested
 	 */
@@ -489,9 +506,20 @@ AllocSetContextCreate(MemoryContext parent,
 	{
 		Size		blksize = MAXALIGN(minContextSize);
 		AllocBlock	block;
+		int			idx = (blksize / PAGE_SIZE) - 1;
 
-		block = (AllocBlock) malloc(blksize);
-		if (block == NULL)
+		/* let's see if we can get the block from block freelist */
+		if ((idx < 6) && (mmap_cache_blocks[idx] != NULL))
+		{
+			block = mmap_cache_blocks[idx];
+			mmap_cache_blocks[idx] = block->next;
+			mmap_cache_slots[idx] += 1;
+		}
+		else
+			block = (AllocBlock) mmap(NULL, blksize, PROT_READ | PROT_WRITE,
+									  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+		if (block == (void*)-1)
 		{
 			MemoryContextStats(TopMemoryContext);
 			ereport(ERROR,
@@ -500,6 +528,7 @@ AllocSetContextCreate(MemoryContext parent,
 					 errdetail("Failed while creating memory context \"%s\".",
 							   name)));
 		}
+
 		block->aset = context;
 		block->freeptr = ((char *) block) + ALLOC_BLOCKHDRSZ;
 		block->endptr = ((char *) block) + blksize;
@@ -590,10 +619,22 @@ AllocSetReset(MemoryContext context)
 		else
 		{
 			/* Normal case, release the block */
+			Size	len = block->endptr - ((char *) block);
+			int		idx = (len / PAGE_SIZE) - 1;
+
 #ifdef CLOBBER_FREED_MEMORY
 			wipe_mem(block, block->freeptr - ((char *) block));
 #endif
-			free(block);
+			/* add the block to the block freelist, if possible */
+			if ((idx < 6) && (mmap_cache_slots[idx] > 0))
+			{
+				block->aset = NULL;
+				block->next = mmap_cache_blocks[idx];
+				mmap_cache_blocks[idx] = block;
+				mmap_cache_slots[idx] -= 1;
+			}
+			else
+				munmap(block, len);
 		}
 		block = next;
 	}
@@ -631,11 +672,23 @@ AllocSetDelete(MemoryContext context)
 	while (block != NULL)
 	{
 		AllocBlock	next = block->next;
+		Size		len = block->endptr - ((char *) block);
+		int			idx = (len / PAGE_SIZE) - 1;
 
 #ifdef CLOBBER_FREED_MEMORY
 		wipe_mem(block, block->freeptr - ((char *) block));
 #endif
-		free(block);
+		/* add the block to the global block freelist, if possible */
+		if ((idx < 6) && (mmap_cache_slots[idx] > 0))
+		{
+			block->aset = NULL;
+			block->next = mmap_cache_blocks[idx];
+			mmap_cache_blocks[idx] = block;
+			mmap_cache_slots[idx] -= 1;
+		}
+		else
+			munmap(block, len);
+
 		block = next;
 	}
 }
@@ -661,17 +714,38 @@ AllocSetAlloc(MemoryContext context, Size size)
 
 	AssertArg(AllocSetIsValid(set));
 
+	chunk_size = MAXALIGN(size);
+
 	/*
-	 * If requested size exceeds maximum for chunks, allocate an entire block
-	 * for this request.
+	 * If requested size exceeds PAGE_SIZE, allocate a separate block for this request.
 	 */
-	if (size > set->allocChunkLimit)
+	if (chunk_size >= set->allocChunkLimit)
 	{
-		chunk_size = MAXALIGN(size);
+		int			idx;
+
+		/* what size would this require, if alone in the block */
 		blksize = chunk_size + ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ;
-		block = (AllocBlock) malloc(blksize);
-		if (block == NULL)
+		blksize = ((blksize + PAGE_SIZE - 1) / PAGE_SIZE) * PAGE_SIZE;
+
+		idx = (blksize / PAGE_SIZE) - 1;
+
+		/* use the whole block for this single chunk */
+		chunk_size = blksize - (ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ);
+
+		/* let's see if we can get the block from cache */
+		if ((idx < 6) && (mmap_cache_blocks[idx] != NULL))
+		{
+			block = mmap_cache_blocks[idx];
+			mmap_cache_blocks[idx] = block->next;
+			mmap_cache_slots[idx] += 1;
+		}
+		else
+			block = (AllocBlock) mmap(NULL, blksize, PROT_READ | PROT_WRITE,
+									  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+		if (block == (void*)-1)
 			return NULL;
+
 		block->aset = set;
 		block->freeptr = block->endptr = ((char *) block) + blksize;
 
@@ -689,7 +763,6 @@ AllocSetAlloc(MemoryContext context, Size size)
 		/* fill the allocated space with junk */
 		randomize_mem((char *) AllocChunkGetPointer(chunk), size);
 #endif
-
 		/*
 		 * Stick the new block underneath the active allocation block, so that
 		 * we don't lose the use of the space remaining therein.
@@ -716,6 +789,12 @@ AllocSetAlloc(MemoryContext context, Size size)
 		VALGRIND_MAKE_MEM_NOACCESS((char *) chunk + ALLOC_CHUNK_PUBLIC,
 						 chunk_size + ALLOC_CHUNKHDRSZ - ALLOC_CHUNK_PUBLIC);
 
+#ifdef MMAP_STATS
+		allocated_chunks += 1;
+		allocated_bytes += blksize;
+		print_stats();
+#endif
+
 		return AllocChunkGetPointer(chunk);
 	}
 
@@ -750,6 +829,13 @@ AllocSetAlloc(MemoryContext context, Size size)
 #endif
 
 		AllocAllocInfo(set, chunk);
+
+#ifdef MMAP_STATS
+		reuse_count += 1;
+		freecounts[fidx] -= 1;
+		print_stats();
+#endif
+
 		return AllocChunkGetPointer(chunk);
 	}
 
@@ -812,6 +898,9 @@ AllocSetAlloc(MemoryContext context, Size size)
 #endif
 				chunk->aset = (void *) set->freelist[a_fidx];
 				set->freelist[a_fidx] = chunk;
+#ifdef MMAP_STATS
+				freecounts[a_fidx] += 1;
+#endif
 			}
 
 			/* Mark that we need to create a new block */
@@ -825,40 +914,34 @@ AllocSetAlloc(MemoryContext context, Size size)
 	if (block == NULL)
 	{
 		Size		required_size;
-
-		/*
-		 * The first such block has size initBlockSize, and we double the
-		 * space in each succeeding block, but not more than maxBlockSize.
-		 */
-		blksize = set->nextBlockSize;
-		set->nextBlockSize <<= 1;
-		if (set->nextBlockSize > set->maxBlockSize)
-			set->nextBlockSize = set->maxBlockSize;
+		int			idx;
 
 		/*
 		 * If initBlockSize is less than ALLOC_CHUNK_LIMIT, we could need more
 		 * space... but try to keep it a power of 2.
 		 */
+
 		required_size = chunk_size + ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ;
-		while (blksize < required_size)
-			blksize <<= 1;
+		blksize = ((required_size + PAGE_SIZE - 1) / PAGE_SIZE) * PAGE_SIZE;
 
-		/* Try to allocate it */
-		block = (AllocBlock) malloc(blksize);
+		idx = (blksize / PAGE_SIZE) - 1;
 
-		/*
-		 * We could be asking for pretty big blocks here, so cope if malloc
-		 * fails.  But give up if there's less than a meg or so available...
-		 */
-		while (block == NULL && blksize > 1024 * 1024)
+		if ((idx < 6) && (mmap_cache_blocks[idx] != NULL))
 		{
-			blksize >>= 1;
-			if (blksize < required_size)
-				break;
-			block = (AllocBlock) malloc(blksize);
+			block = mmap_cache_blocks[idx];
+			mmap_cache_blocks[idx] = block->next;
+			mmap_cache_slots[idx] += 1;
 		}
+		else
+			/* Try to allocate it */
+			block = (AllocBlock) mmap(NULL, blksize, PROT_READ | PROT_WRITE,
+									  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 
-		if (block == NULL)
+#ifdef MMAP_STATS
+		allocated_bytes += blksize;
+#endif
+
+		if (block == (void*)-1)
 			return NULL;
 
 		block->aset = set;
@@ -912,6 +995,12 @@ AllocSetAlloc(MemoryContext context, Size size)
 #endif
 
 	AllocAllocInfo(set, chunk);
+
+#ifdef MMAP_STATS
+	allocated_chunks += 1;
+	print_stats();
+#endif
+
 	return AllocChunkGetPointer(chunk);
 }
 
@@ -945,6 +1034,8 @@ AllocSetFree(MemoryContext context, void *pointer)
 		 */
 		AllocBlock	block = set->blocks;
 		AllocBlock	prevblock = NULL;
+		Size		len;
+		int			idx;
 
 		while (block != NULL)
 		{
@@ -964,10 +1055,29 @@ AllocSetFree(MemoryContext context, void *pointer)
 			set->blocks = block->next;
 		else
 			prevblock->next = block->next;
+#ifdef MMAP_STATS
+		freed_bytes += block->endptr - ((char *) block);
+#endif
+		len = block->endptr - ((char *) block);
+		idx = (len / PAGE_SIZE) - 1;
+
 #ifdef CLOBBER_FREED_MEMORY
 		wipe_mem(block, block->freeptr - ((char *) block));
 #endif
-		free(block);
+#ifdef MMAP_STATS
+		freed_chunks += 1;
+#endif
+
+		/* add the block to the cache, if possible */
+		if ((idx < 6) && (mmap_cache_slots[idx] > 0))
+		{
+			block->aset = NULL;
+			block->next = mmap_cache_blocks[idx];
+			mmap_cache_blocks[idx] = block;
+			mmap_cache_slots[idx] -= 1;
+		}
+		else
+			munmap(block, len);
 	}
 	else
 	{
@@ -985,6 +1095,9 @@ AllocSetFree(MemoryContext context, void *pointer)
 		chunk->requested_size = 0;
 #endif
 		set->freelist[fidx] = chunk;
+#ifdef MMAP_STATS
+		freecounts[fidx] += 1;
+#endif
 	}
 }
 
@@ -1087,6 +1200,7 @@ AllocSetRealloc(MemoryContext context, void *pointer, Size size)
 		}
 		if (block == NULL)
 			elog(ERROR, "could not find block containing chunk %p", chunk);
+
 		/* let's just make sure chunk is the only one in the block */
 		Assert(block->freeptr == ((char *) block) +
 			   (chunk->size + ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ));
@@ -1094,9 +1208,12 @@ AllocSetRealloc(MemoryContext context, void *pointer, Size size)
 		/* Do the realloc */
 		chksize = MAXALIGN(size);
 		blksize = chksize + ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ;
-		block = (AllocBlock) realloc(block, blksize);
-		if (block == NULL)
+		blksize = ((blksize + PAGE_SIZE - 1) / PAGE_SIZE) * PAGE_SIZE;
+
+		block = (AllocBlock) mremap(block, block->endptr - ((char*)block), blksize, MREMAP_MAYMOVE);
+		if (block == (void*)-1)
 			return NULL;
+
 		block->freeptr = block->endptr = ((char *) block) + blksize;
 
 		/* Update pointers since block has likely been moved */
@@ -1106,7 +1223,8 @@ AllocSetRealloc(MemoryContext context, void *pointer, Size size)
 			set->blocks = block;
 		else
 			prevblock->next = block;
-		chunk->size = chksize;
+
+		chunk->size = blksize - (ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ);
 
 #ifdef MEMORY_CONTEXT_CHECKING
 #ifdef RANDOMIZE_ALLOCATED_MEMORY
@@ -1361,3 +1479,22 @@ AllocSetCheck(MemoryContext context)
 }
 
 #endif   /* MEMORY_CONTEXT_CHECKING */
+
+#ifdef MMAP_STATS
+static void	print_stats()
+{
+	if ((allocated_chunks > 0) && (allocated_chunks % 1000 == 0))
+	{
+		printf("allocated bytes=%ld count=%ld freed bytes=%ld count=%ld reused count=%ld\n",
+			   allocated_bytes, allocated_chunks, freed_bytes, freed_chunks, reuse_count);
+		print_freelist_stats();
+	}
+}
+
+static void print_freelist_stats()
+{
+	printf("freelist [%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d]\n",
+			freecounts[0], freecounts[1], freecounts[2], freecounts[3], freecounts[4], freecounts[5],
+			freecounts[6], freecounts[7], freecounts[8], freecounts[9], freecounts[10]);
+}
+#endif
\ No newline at end of file
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to