On Wed, Feb 12, 2025 at 9:57 PM Robert Haas <[email protected]> wrote:
>
> On Wed, Feb 12, 2025 at 3:07 PM Tomas Vondra <[email protected]> wrote:
> > AFAICS the "1" value is simply one of the many "defensive" defaults in
> > our sample config. It's much more likely to help than cause harm, even
> > on smaller/older systems, but for many systems a higher value would be
> > more appropriate. There's usually a huge initial benefit (say, going to
> > 16 or 32), and then the benefits diminish fairly quickly.
>
> I'm happy to see us change the value to something that is likely to be
> good for most people. I think it's OK if people on very tiny systems
> need to change a few defaults for optimum performance. We should keep
> in mind that people do sometimes run PostgreSQL on fairly small VMs
> and not go crazy with it, but there's no reason to pretend that the
> typical database runs on a Raspberry Pi.
Is there any reason we couldn't have a new pg_test_iorates (similar to
the other pg_test_* programs) that would literally do this: calibrate
the best e_io_c (effective_io_concurrency) during initdb and put the
result into postgresql.auto.conf (pg_test_iorates --adjust-auto-conf)?
That way we would avoid user questions on how to come up with an optimal value.
root@jw-test3:/nvme# ./pg_test_iorates
File size: 16.00 GB, Block size: 8192 bytes
buffered sequential read: 1573.33 MB/s
direct sequential read: 289.46 MB/s
random read performance with different prefetch distances:
using prefetch distance 1: 173.58 MB/s
using prefetch distance 2: 243.05 MB/s
using prefetch distance 4: 376.78 MB/s
using prefetch distance 8: 590.66 MB/s
using prefetch distance 16: 824.49 MB/s
using prefetch distance 32: 861.45 MB/s
using prefetch distance 64: 830.36 MB/s
Attached; compile it the naive way via: gcc pg_test_iorates.c -o
pg_test_iorates -I /git/postgres/build/src/include -I
/git/postgres/src/include -L /git/postgres/build/src/common
-L/git/postgres/build/src/port -lpgcommon -lpgport -lm
-J.
/*-------------------------------------------------------------------------
*
* pg_test_iorates --- measures I/O and prefetch distance impact
*
* Win32 not tested
*
*-------------------------------------------------------------------------
*/
/*
 * Feature-test macros must be defined before the first system header is
 * pulled in; once <features.h> has been processed they have no effect and
 * O_DIRECT stays hidden on glibc.
 */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
/* NOTE(review): glibc-internal macro; should be redundant now that
 * _GNU_SOURCE is defined early -- consider dropping it. */
#define __USE_GNU 1

#include "c.h"
#include "postgres_fe.h"

#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <signal.h>
#include <stdint.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <time.h>
#include <unistd.h>

#include "common/logging.h"
#include "common/pg_prng.h"
#include "getopt_long.h"
/*
 * Built-in defaults, used unless overridden on the command line.
 *
 * DEFAULT_FILE_SIZE is computed in 64-bit arithmetic and fully parenthesized:
 * written as a plain int product, 16 * 1024^3 overflows a 32-bit int and the
 * macro would also mis-expand inside larger expressions (e.g. "x / MACRO").
 */
#define DEFAULT_FILE_SIZE (16LL * 1024 * 1024 * 1024)	/* 16 GB */
#define DEFAULT_BLOCK_SIZE 8192	/* 8 KB */
#define DEFAULT_TEST_TIME 3		/* seconds */
/* Program name derived from argv[0]; assigned once in main(). */
static const char *progname;
/* Path of the scratch file the tests read; replaced by the -f/--file option. */
static char *test_filename = "pg_test_io.tmp";
/* Set to 1 once create_test_file() has created the file, so cleanup() removes it. */
static int needs_unlink = 0;
/* Size of the scratch file in bytes; replaced by the -s/--size option. */
static ssize_t file_size = (ssize_t)DEFAULT_FILE_SIZE;
/* Size in bytes of each read/write request; replaced by the -b/--block-size option. */
static size_t block_size = DEFAULT_BLOCK_SIZE;
/* Duration in seconds passed to alarm() for each measurement; replaced by -t/--time. */
static unsigned test_seconds = DEFAULT_TEST_TIME;
/* Written by signal_handler(); the measurement loops poll it to know when to stop. */
static volatile sig_atomic_t alarm_triggered;
/* Forward declarations: setup and teardown. */
static void handle_args(int argc, char **argv);
static void create_test_file(void);
static void cleanup(void);

/* Forward declarations: timer callback. */
static void signal_handler(SIGNAL_ARGS);

/* Forward declarations: the individual measurements. */
static void test_sequential_read(int use_direct);
static void test_random_read(int use_direct,
							 int use_advise,
							 int prefetch_distance);
/*
 * DIRECT_FLAG is the open() flag requesting unbuffered ("direct") I/O.
 *
 * It is defined only when the platform headers actually provide such a flag.
 * All users in this file test "#ifdef DIRECT_FLAG", so the macro must be left
 * undefined on platforms without direct I/O; defining it unconditionally
 * would make those guards always true and break the build there.
 */
#if defined(O_DIRECT)
#define DIRECT_FLAG O_DIRECT
#elif defined(_O_DIRECT)
#define DIRECT_FLAG _O_DIRECT
#endif
int
main(int argc, char **argv)
{
progname = get_progname(argv[0]);
pg_logging_init(progname);
handle_args(argc, argv);
#ifndef WIN32
signal(SIGALRM, signal_handler);
#endif
pg_prng_seed(&pg_global_prng_state, (uint64) time(NULL));
printf("File size: %.2f GB, Block size: %zu bytes\n",
(double)file_size / (1024 * 1024 * 1024), block_size);
create_test_file();
test_sequential_read(false);
#ifdef DIRECT_FLAG
test_sequential_read(true);
#endif
printf("random read performance with different prefetch distances:\n");
int test_distances[] = {1, 2, 4, 8, 16, 32, 64};
int num_distances = sizeof(test_distances) / sizeof(int);
for (int i = 0; i < num_distances; i++) {
int d = test_distances[i];
printf("using prefetch distance %d: ", d);
fflush(stdout);
test_random_read(false, true, d);
}
cleanup();
return 0;
}
static void
handle_args(int argc, char **argv)
{
static struct option long_options[] = {
{"file", required_argument, NULL, 'f'},
{"size", required_argument, NULL, 's'},
{"block-size", required_argument, NULL, 'b'},
{"time", required_argument, NULL, 't'},
{NULL, 0, NULL, 0}
};
int c;
while ((c = getopt_long(argc, argv, "f:s:b:t:", long_options, NULL)) !=
-1) {
switch (c) {
case 'f':
test_filename = pg_strdup(optarg);
break;
case 's':
file_size = strtoul(optarg, NULL, 10);
break;
case 'b':
block_size = strtoul(optarg, NULL, 10);
break;
case 't':
test_seconds = strtoul(optarg, NULL, 10);
break;
default:
pg_fatal("Unknown argument");
}
}
}
/*
 * create_test_file
 *
 * Creates (or truncates) test_filename and fills it with file_size bytes,
 * written in block_size chunks, then flushes it to storage.  Sets
 * needs_unlink as soon as the file exists so that cleanup() removes it.
 *
 * Exits via pg_fatal() if the file cannot be created, written, synced or
 * closed.
 */
static void
create_test_file(void)
{
	int			fd;
	char	   *buf;
	size_t		remaining;

	buf = pg_malloc(block_size);

	/* Fill the buffer with a fixed, repeating 0..255 byte pattern */
	for (size_t i = 0; i < block_size; i++)
		buf[i] = (char) (i % 256);

	fd = open(test_filename, O_WRONLY | O_CREAT | O_TRUNC | PG_BINARY, 0600);
	if (fd < 0)
		pg_fatal("could not create test file \"%s\": %m", test_filename);
	needs_unlink = 1;

	/* Write the file block by block, coping with short writes */
	remaining = (size_t) file_size;
	while (remaining > 0)
	{
		size_t		chunk = (remaining > block_size) ? block_size : remaining;
		ssize_t		written = write(fd, buf, chunk);

		if (written < 0)
		{
			if (errno == EINTR)
				continue;
			pg_fatal("could not write test file \"%s\": %m", test_filename);
		}

		/* A zero-byte write makes no progress; bail out instead of spinning. */
		if (written == 0)
			pg_fatal("could not write test file \"%s\": wrote 0 of %zu bytes",
					 test_filename, chunk);

		remaining -= (size_t) written;
	}

	/* Deferred write errors (e.g. out of space) surface here. */
	if (fsync(fd) != 0)
		pg_fatal("could not fsync test file \"%s\": %m", test_filename);
	if (close(fd) != 0)
		pg_fatal("could not close test file \"%s\": %m", test_filename);

	free(buf);
}
/*
 * test_sequential_read
 *
 * Reads test_filename front to back in block_size requests, wrapping around
 * to the start, until the test_seconds alarm fires; then prints the achieved
 * throughput in MB/s.
 *
 * use_direct: when non-zero and DIRECT_FLAG is available, the file is opened
 *             with DIRECT_FLAG.
 *
 * Exits via pg_fatal() on allocation, open or read failure.
 */
static void
test_sequential_read(int use_direct)
{
	int			fd;
	int			rc;
	void	   *aligned = NULL;
	char	   *buf;
	size_t		total_read = 0;
	struct timeval start,
				end;
	double		elapsed,
				mbps;
	int			flags = O_RDONLY | PG_BINARY;

#ifdef DIRECT_FLAG
	if (use_direct)
		flags |= DIRECT_FLAG;
#endif

	/*
	 * Direct I/O needs an aligned buffer.  posix_memalign() reports failure
	 * through its (positive) return value and does not set errno, so the
	 * result is checked against 0 and decoded with strerror().
	 */
	rc = posix_memalign(&aligned, 512, block_size);
	if (rc != 0)
		pg_fatal("posix_memalign failed: %s", strerror(rc));
	buf = aligned;

	if ((fd = open(test_filename, flags)) < 0)
		pg_fatal("could not open test file \"%s\": %m", test_filename);

	printf("%s sequential read: ", use_direct ? "direct" : "buffered");
	fflush(stdout);

	alarm_triggered = false;
	alarm(test_seconds);
	gettimeofday(&start, NULL);

	while (!alarm_triggered)
	{
		off_t		offset = 0;

		/* pread() takes an explicit offset, so no seek is needed per pass. */
		while (offset < file_size && !alarm_triggered)
		{
			ssize_t		bytes_read = pread(fd, buf, block_size, offset);

			if (bytes_read < 0)
			{
				/* Interrupted by the alarm: let the loop condition decide. */
				if (errno == EINTR)
					continue;
				pg_fatal("read error, rc=%zd: %m", bytes_read);
			}

			/* Unexpected end of file: start the next pass from offset 0. */
			if (bytes_read == 0)
				break;

			total_read += (size_t) bytes_read;
			offset += bytes_read;
		}
	}

	gettimeofday(&end, NULL);
	close(fd);
	free(buf);

	elapsed = (end.tv_sec - start.tv_sec) +
		(end.tv_usec - start.tv_usec) / 1000000.0;
	/* Guard against a zero/negative interval if the wall clock was adjusted. */
	mbps = (elapsed > 0) ? (total_read / (1024.0 * 1024.0)) / elapsed : 0.0;
	printf("%.2f MB/s\n", mbps);
}
/*
 * cleanup
 *
 * Deletes the scratch file, but only if needs_unlink says one was created.
 */
static void
cleanup(void)
{
	if (!needs_unlink)
		return;

	unlink(test_filename);
}
/*
 * signal_handler
 *
 * Signal callback: records that the timer expired by raising the
 * alarm_triggered flag, which the measurement loops poll.
 */
static void
signal_handler(SIGNAL_ARGS)
{
	alarm_triggered = 1;
}
/*
 * random_block_offset
 *
 * Returns the byte offset of a randomly chosen block among the first
 * max_blocks blocks of the file.  max_blocks must be non-zero.
 */
static off_t
random_block_offset(size_t max_blocks)
{
	return (off_t) ((pg_prng_uint64(&pg_global_prng_state) % max_blocks) *
					block_size);
}

/*
 * test_random_read
 *
 * Reads randomly chosen blocks of test_filename until the test_seconds alarm
 * fires, then prints the achieved throughput in MB/s.
 *
 * use_direct:        when non-zero and DIRECT_FLAG is available, the file is
 *                    opened with DIRECT_FLAG.
 * use_advise:        when non-zero, upcoming offsets are announced with
 *                    posix_fadvise(POSIX_FADV_WILLNEED) before being read.
 * prefetch_distance: number of announced-but-not-yet-read blocks kept in
 *                    flight; a value <= 0 disables prefetching.
 *
 * Exits via pg_fatal() on allocation, open or read failure, or when the file
 * is smaller than one block.
 */
static void
test_random_read(int use_direct, int use_advise, int prefetch_distance)
{
	int			fd;
	int			rc;
	void	   *aligned = NULL;
	char	   *buf;
	size_t		total_read = 0;
	struct timeval start,
				end;
	double		elapsed,
				mbps;
	size_t		max_blocks = file_size / block_size;
	off_t	   *prefetch_queue = NULL;
	int			qhead = 0;
	int			flags = O_RDONLY | PG_BINARY;

#ifdef DIRECT_FLAG
	if (use_direct)
		flags |= DIRECT_FLAG;
#endif

	/* Offsets are computed modulo max_blocks, so it must not be zero. */
	if (max_blocks == 0)
		pg_fatal("test file is smaller than one block");

	/* A non-positive distance means "no prefetch"; avoids "% 0" below. */
	if (prefetch_distance <= 0)
		use_advise = false;

	/*
	 * FIXME: search for PG variant of this
	 *
	 * posix_memalign() reports failure through its (positive) return value
	 * and does not set errno.
	 */
	rc = posix_memalign(&aligned, 512, block_size);
	if (rc != 0)
		pg_fatal("posix_memalign failed: %s", strerror(rc));
	buf = aligned;

	if ((fd = open(test_filename, flags)) < 0)
		pg_fatal("could not open test file \"%s\": %m", test_filename);

	/* Initialize prefetch queue if using advice */
	if (use_advise)
	{
		prefetch_queue = pg_malloc(prefetch_distance * sizeof(off_t));

		/* Pre-fill the queue, announcing each offset as it is chosen */
		for (int i = 0; i < prefetch_distance; i++)
		{
			prefetch_queue[i] = random_block_offset(max_blocks);
			/* The advice is only a hint; a failure is not an error here. */
			(void) posix_fadvise(fd, prefetch_queue[i], block_size,
								 POSIX_FADV_WILLNEED);
		}
	}

	alarm_triggered = false;
	alarm(test_seconds);
	gettimeofday(&start, NULL);

	while (!alarm_triggered)
	{
		off_t		offset;
		ssize_t		bytes_read;

		if (use_advise)
		{
			off_t		new_offset = random_block_offset(max_blocks);

			/* Oldest announced offset is the one to read now */
			offset = prefetch_queue[qhead];

			/* Announce a replacement and store it in the freed slot */
			(void) posix_fadvise(fd, new_offset, block_size,
								 POSIX_FADV_WILLNEED);
			prefetch_queue[qhead] = new_offset;
			qhead = (qhead + 1) % prefetch_distance;
		}
		else
		{
			/* Simple random read without prefetch */
			offset = random_block_offset(max_blocks);
		}

		bytes_read = pread(fd, buf, block_size, offset);
		if (bytes_read < 0)
		{
			/* Interrupted by the alarm: let the loop condition decide. */
			if (errno == EINTR)
				continue;
			pg_fatal("read error, rc=%zd: %m", bytes_read);
		}
		total_read += (size_t) bytes_read;
	}

	gettimeofday(&end, NULL);
	close(fd);
	free(buf);
	free(prefetch_queue);

	elapsed = (end.tv_sec - start.tv_sec) +
		(end.tv_usec - start.tv_usec) / 1000000.0;
	/* Guard against a zero/negative interval if the wall clock was adjusted. */
	mbps = (elapsed > 0) ? (total_read / (1024.0 * 1024.0)) / elapsed : 0.0;
	printf("%.2f MB/s\n", mbps);
}