There was a very interesting presentation at PGCon about pg_prefaulter:

http://www.pgcon.org/2018/schedule/events/1204.en.html

But it is implemented in Go and relies on pg_waldump.
I tried to do the same using the built-in Postgres WAL traversal functions.
I have implemented it as an extension for simplicity of integration.
In principle, it could also be started as a background worker.

First of all, I tried to estimate the effect of preloading data.
I have implemented a prefetch utility, which is also attached to this mail.
It performs random reads of blocks of some large file and spawns some number of prefetch threads:

Just normal read without prefetch:
./prefetch -n 0 SOME_BIG_FILE

One prefetch thread which uses pread:
./prefetch SOME_BIG_FILE

One prefetch thread which uses posix_fadvise:
./prefetch -f SOME_BIG_FILE

4 prefetch threads which use posix_fadvise:
./prefetch -f -n 4 SOME_BIG_FILE

Based on these experiments (on my desktop), I came to the following conclusions:

1. Prefetch on HDD doesn't give any positive effect.
2. Using posix_fadvise makes it possible to speed up random reads on SSD up to 2 times.
3. posix_fadvise(WILLNEED) is more efficient than performing normal reads.
4. Calling posix_fadvise from more than one thread makes no sense.

I have tested wal_prefetch on two powerful servers with 24 cores, a 3TB NVMe RAID 10 storage device and 256GB of RAM, connected using InfiniBand. The speed of synchronous replication between the two nodes increased from 56k TPS to 60k TPS (on pgbench with scale 1000).

Usage:
1. On the master: create extension wal_prefetch
2. On the replica: call the pg_wal_prefetch() function; it will not return until you interrupt it.

The pg_wal_prefetch function will traverse WAL indefinitely and prefetch the blocks referenced in WAL records
using the posix_fadvise(WILLNEED) system call.

It is possible to explicitly specify the start LSN for the pg_wal_prefetch() function. Otherwise, the WAL redo position will be used as the start LSN.


--

Konstantin Knizhnik
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>

/* Size in bytes of every read and prefetch request; generated file offsets are multiples of it. */
#define BLOCK_SIZE 8192
/* Initial seed; reader() and every prefetcher() thread start their offset sequence from it. */
#define INIT_SEED 1999
/* Number of elements in the pthread_t array declared in main(). */
#define MAX_THREADS 8

/* Length of the test file in whole BLOCK_SIZE blocks; computed in main(). */
size_t file_size;
/* Path of the test file, taken from the command line in main(). */
char* file_name;
/* Number of prefetcher threads to start; set by the -n option, 1 by default. */
size_t n_prefetchers = 1;
/* Non-zero when prefetchers should call posix_fadvise() instead of pread(); set by -f. */
int  use_fadvise;
/* Number of block reads performed by reader(); set by the -i option. */
int n_iterations = 10*1024*1024;

/*
 * Advance the pseudo-random generator stored in *seed by one step and
 * return the byte offset of the block it selects.
 *
 * The new state is (3141592621 * seed + 2718281829) mod 1000000007; it is
 * written back to *seed. The state modulo file_size picks a block, and the
 * result is that block index multiplied by BLOCK_SIZE.
 */
static off_t random_offset(uint64_t* seed)
{
	const uint64_t next_state = (3141592621u * *seed + 2718281829u) % 1000000007u;
	off_t block;

	*seed = next_state;
	block = (off_t)next_state;
	return block % file_size * BLOCK_SIZE;
}

/*
 * Foreground benchmark loop.
 *
 * Opens file_name and performs n_iterations reads of BLOCK_SIZE bytes at
 * offsets produced by random_offset(), starting from INIT_SEED. Every 1024
 * iterations the average throughput since the start is printed on one
 * self-overwriting line (terminated by '\r').
 *
 * Any I/O failure is reported on stderr and terminates the process with
 * EXIT_FAILURE. The checks are explicit rather than assert()-based so they
 * stay active when the program is built with NDEBUG.
 */
void reader(void)
{
	uint64_t seed = INIT_SEED;
	char page[BLOCK_SIZE];
	time_t start = time(NULL);
	int i;
	int fd = open(file_name, O_RDONLY);

	if (fd < 0) {
		perror(file_name);
		exit(EXIT_FAILURE);
	}

	for (i = 0; i < n_iterations; i++) {
		off_t offs = random_offset(&seed);
		ssize_t rc = pread(fd, page, sizeof page, offs);

		if (rc != BLOCK_SIZE) {
			if (rc < 0) {
				perror("pread");
			} else {
				fprintf(stderr, "pread: short read of %ld bytes at offset %lld\n",
						(long)rc, (long long)offs);
			}
			close(fd);
			exit(EXIT_FAILURE);
		}

		/* The clock is only needed when a progress line may be printed. */
		if (i % 1024 == 0) {
			time_t now = time(NULL);

			/* Skip the first second to avoid dividing by zero elapsed time. */
			if (now != start) {
				printf("%d: %.2f Mb/sec   \r", i/1024, (double)(i+1)*BLOCK_SIZE/1024/1024/(now - start));
				fflush(stdout);
			}
		}
	}

	close(fd);
}

/*
 * Prefetch thread body.
 *
 * arg carries the thread's index (cast from size_t). The thread walks the
 * offset sequence produced by random_offset() from INIT_SEED, which is the
 * same seed reader() starts from, and handles every offset whose position
 * in the sequence is congruent to its index modulo n_prefetchers. Depending
 * on use_fadvise, a handled offset is either announced with
 * posix_fadvise(POSIX_FADV_WILLNEED) or actually read with pread().
 *
 * The loop never terminates on its own; the trailing return only satisfies
 * the pthread start-routine signature.
 *
 * Failures print a message and abort() the process, matching what a failed
 * assert() would do but without depending on NDEBUG.
 */
void* prefetcher(void* arg)
{
	size_t id = (size_t)arg;
	uint64_t seed = INIT_SEED;
	char page[BLOCK_SIZE];
	int fd = open(file_name, O_RDONLY);
	/*
	 * Position in the round-robin cycle, kept in [0, n_prefetchers). A
	 * cycling counter replaces an ever-growing int index, which would
	 * eventually overflow (undefined behaviour) in this endless loop.
	 */
	size_t turn = 0;

	if (fd < 0) {
		perror(file_name);
		abort();
	}

	for (;;) {
		off_t offs = random_offset(&seed);

		if (turn == id) {
			if (use_fadvise) {
				/* posix_fadvise returns the error number instead of setting errno. */
				int rc = posix_fadvise(fd, offs, BLOCK_SIZE, POSIX_FADV_WILLNEED);

				if (rc != 0) {
					fprintf(stderr, "posix_fadvise failed: error %d\n", rc);
					abort();
				}
			} else {
				ssize_t rc = pread(fd, page, sizeof page, offs);

				if (rc != BLOCK_SIZE) {
					if (rc < 0) {
						perror("pread");
					} else {
						fprintf(stderr, "pread: short read of %ld bytes at offset %lld\n",
								(long)rc, (long long)offs);
					}
					abort();
				}
			}
		}

		if (++turn >= n_prefetchers) {
			turn = 0;
		}
	}
	return 0;
}



/* Print the command-line synopsis on stderr and return the failure exit status. */
static int usage(void)
{
	fprintf(stderr, "prefetch [-f] [-n THREADS] [-i ITERATIONS] file\n");
	return 1;
}

/*
 * Parse text as a decimal integer in the range [0, max_value].
 *
 * Returns 0 and stores the value in *out on success. Returns -1 when text
 * is NULL, contains no digits, has trailing characters, or is out of range.
 */
static int parse_count(const char* text, long max_value, long* out)
{
	char* end;
	long value;

	if (text == NULL) {
		return -1;
	}
	errno = 0;
	value = strtol(text, &end, 10);
	if (end == text || *end != '\0' || errno == ERANGE) {
		return -1;
	}
	if (value < 0 || value > max_value) {
		return -1;
	}
	*out = value;
	return 0;
}

/*
 * Entry point: prefetch [-f] [-n THREADS] [-i ITERATIONS] file
 *
 *   -f             prefetch with posix_fadvise() instead of pread()
 *   -n THREADS     number of prefetch threads, 0..MAX_THREADS (0 disables prefetch)
 *   -i ITERATIONS  number of block reads done by reader()
 *
 * Only the character after '-' selects the option. The file size is
 * measured in whole BLOCK_SIZE blocks, the prefetch threads are started,
 * and reader() runs in the calling thread. The prefetch threads are never
 * joined; they end when main() returns.
 *
 * Returns 0 on success and 1 on a usage or setup error.
 */
int main(int argc, char* argv[])
{
	pthread_t prefetchers[MAX_THREADS];
	size_t t;
	int i;
	int fd;
	off_t end;

	for (i = 1; i < argc; i++) {
		long value;

		if (argv[i][0] != '-') {
			file_name = argv[i];
			continue;
		}
		switch (argv[i][1]) {
		  case 'f':
			use_fadvise = 1;
			break;
		  case 'n':
			/* The bound keeps the thread count within the prefetchers[] array. */
			if (i + 1 >= argc || parse_count(argv[++i], MAX_THREADS, &value) != 0) {
				fprintf(stderr, "-n requires a thread count between 0 and %d\n", MAX_THREADS);
				return usage();
			}
			n_prefetchers = (size_t)value;
			break;
		  case 'i':
			if (i + 1 >= argc || parse_count(argv[++i], INT_MAX, &value) != 0) {
				fprintf(stderr, "-i requires a non-negative iteration count\n");
				return usage();
			}
			n_iterations = (int)value;
			break;
		  default:
			return usage();
		}
	}
	if (file_name == NULL) {
		return usage();
	}

	fd = open(file_name, O_RDONLY);
	if (fd < 0) {
		perror(file_name);
		return 1;
	}
	end = lseek(fd, 0, SEEK_END);
	/* reader() and the prefetchers open their own descriptors. */
	close(fd);
	if (end < 0) {
		perror("lseek");
		return 1;
	}
	file_size = (size_t)(end / BLOCK_SIZE);
	if (file_size == 0) {
		fprintf(stderr, "%s: file is smaller than one %d-byte block\n", file_name, BLOCK_SIZE);
		return 1;
	}

	for (t = 0; t < n_prefetchers; t++) {
		/* pthread_create returns the error number instead of setting errno. */
		int rc = pthread_create(&prefetchers[t], NULL, prefetcher, (void*)t);

		if (rc != 0) {
			fprintf(stderr, "pthread_create failed: error %d\n", rc);
			return 1;
		}
	}

	reader();
	puts("\nDone");
	return 0;
}

Attachment: wal_prefetch.tgz
Description: application/compressed-tar

Reply via email to