On Wed, Feb 12, 2025 at 9:57 PM Robert Haas <robertmh...@gmail.com> wrote:
>
> On Wed, Feb 12, 2025 at 3:07 PM Tomas Vondra <to...@vondra.me> wrote:
> > AFAICS the "1" value is simply one of the many "defensive" defaults in
> > our sample config. It's much more likely to help than cause harm, even
> > on smaller/older systems, but for many systems a higher value would be
> > more appropriate. There's usually a huge initial benefit (say, going to
> > 16 or 32), and then the benefits diminish fairly quickly.
>
> I'm happy to see us change the value to something that is likely to be
> good for most people. I think it's OK if people on very tiny systems
> need to change a few defaults for optimum performance. We should keep
> in mind that people do sometimes run PostgreSQL on fairly small VMs
> and not go crazy with it, but there's no reason to pretend that the
> typical database runs on a Raspberry Pi.

Is there any reason we couldn't have a new pg_test_iorates (similar to
the other pg_test_* proggies) that would literally do this: calibrate the
best e_io_c during initdb and put the result into postgresql.auto.conf
(pg_test_iorates --adjust-auto-conf)? That way we would avoid user
questions on how to come up with an optimal value.

root@jw-test3:/nvme# ./pg_test_iorates
File size: 16.00 GB, Block size: 8192 bytes
buffered sequential read: 1573.33 MB/s
direct sequential read: 289.46 MB/s
random read performance with different prefetch distances:
using prefetch distance 1: 173.58 MB/s
using prefetch distance 2: 243.05 MB/s
using prefetch distance 4: 376.78 MB/s
using prefetch distance 8: 590.66 MB/s
using prefetch distance 16: 824.49 MB/s
using prefetch distance 32: 861.45 MB/s
using prefetch distance 64: 830.36 MB/s

Attached; it can be compiled the naive way via: gcc pg_test_iorates.c -o
pg_test_iorates -I /git/postgres/build/src/include -I
/git/postgres/src/include -L /git/postgres/build/src/common
-L/git/postgres/build/src/port -lpgcommon -lpgport -lm

-J.
/*-------------------------------------------------------------------------
 *
 * pg_test_iorates --- measures I/O and prefetch distance impact
 *
 * Win32 not tested
 *
 *-------------------------------------------------------------------------
 */

#include "c.h"
#include "postgres_fe.h"
#include <limits.h>
#include <sys/stat.h>
#include <sys/time.h>
#define _GNU_SOURCE
#define __USE_GNU 1
#include <fcntl.h>
#include <time.h>
#include <unistd.h>
#include <signal.h>
#include "common/logging.h"
#include "common/pg_prng.h"
#include "getopt_long.h"

/*
 * Default test parameters; each one can be overridden on the command line.
 *
 * DEFAULT_FILE_SIZE is parenthesized and evaluated in long long.  The bare
 * expression 16 * 1024 * 1024 * 1024 overflows int, and without parentheses
 * it only produced the right value when a cast happened to bind to the
 * leading 16.
 */
#define DEFAULT_FILE_SIZE (16LL * 1024 * 1024 * 1024)   /* 16 GB */
#define DEFAULT_BLOCK_SIZE 8192 /* 8 KB */
#define DEFAULT_TEST_TIME 3     /* seconds */

/* Program name used for logging; set at the start of main(). */
static const char *progname;
/* Path of the scratch file that is created, read back and then removed. */
static char    *test_filename = "pg_test_io.tmp";
/* Nonzero once the scratch file has been created and must be unlinked. */
static int      needs_unlink = 0;
/* Size of the scratch file, in bytes. */
static ssize_t file_size = (ssize_t)DEFAULT_FILE_SIZE;
/* Size of each individual read or write, in bytes. */
static size_t block_size = DEFAULT_BLOCK_SIZE;
/* Duration of each measurement, in seconds. */
static unsigned test_seconds = DEFAULT_TEST_TIME;
/* Set by the SIGALRM handler to tell the running measurement to stop. */
static volatile sig_atomic_t alarm_triggered;

/* Forward declarations of the file-local routines, listed in call order. */
static void     handle_args(int argc, char **argv);
static void     signal_handler(SIGNAL_ARGS);
static void     create_test_file(void);
static void     test_sequential_read(int use_direct);
static void     test_random_read(int use_direct, int use_advise,
                                 int prefetch_distance);
static void     cleanup(void);

/*
 * open() flag requesting direct (unbuffered) I/O.
 *
 * The macro is deliberately left undefined on platforms whose <fcntl.h>
 * does not provide O_DIRECT, so that the "#ifdef DIRECT_FLAG" guards in
 * this file really skip the direct I/O code there instead of failing to
 * compile on an undeclared identifier.
 */
#ifdef WIN32
#define DIRECT_FLAG _O_DIRECT
#elif defined(O_DIRECT)
#define DIRECT_FLAG O_DIRECT
#endif

int
main(int argc, char **argv)
{
        progname = get_progname(argv[0]);
        pg_logging_init(progname);

        handle_args(argc, argv);

#ifndef WIN32
        signal(SIGALRM, signal_handler);
#endif

        pg_prng_seed(&pg_global_prng_state, (uint64) time(NULL));
        printf("File size: %.2f GB, Block size: %zu bytes\n",
               (double)file_size / (1024 * 1024 * 1024), block_size);

        create_test_file();

        test_sequential_read(false);
#ifdef DIRECT_FLAG
        test_sequential_read(true);
#endif

        printf("random read performance with different prefetch distances:\n");
        int             test_distances[] = {1, 2, 4, 8, 16, 32, 64};
        int             num_distances = sizeof(test_distances) / sizeof(int);

        for (int i = 0; i < num_distances; i++) {
                int             d = test_distances[i];
                printf("using prefetch distance %d: ", d);
                fflush(stdout);
                test_random_read(false, true, d);
        }

        cleanup();
        return 0;
}

/*
 * Parse 'arg' as a strictly positive decimal integer no larger than 'max'.
 *
 * 'optname' is only used to name the offending option in the error message.
 * Malformed, zero, negative or out-of-range input is a fatal error.
 */
static unsigned long long
parse_positive_arg(const char *arg, const char *optname,
                   unsigned long long max)
{
        char           *end;
        unsigned long long value;

        /*
         * strtoull() silently accepts leading whitespace and a sign (negating
         * the result), so insist that the text starts with a digit.
         */
        if (arg[0] < '0' || arg[0] > '9')
                pg_fatal("invalid value \"%s\" for option %s", arg, optname);

        errno = 0;
        value = strtoull(arg, &end, 10);
        if (end == arg || *end != '\0' || errno == ERANGE)
                pg_fatal("invalid value \"%s\" for option %s", arg, optname);
        if (value == 0 || value > max)
                pg_fatal("value \"%s\" for option %s is out of range", arg, optname);

        return value;
}

/*
 * Process the command line and store the results in the file-level
 * settings:
 *
 *   -f, --file        path of the scratch file   (test_filename)
 *   -s, --size        scratch file size in bytes (file_size)
 *   -b, --block-size  I/O size in bytes          (block_size)
 *   -t, --time        seconds per measurement    (test_seconds)
 *
 * All numeric options must be positive: a zero block size would divide by
 * zero in the read tests, and a zero duration would never arm the alarm
 * that ends each measurement.  Invalid input is a fatal error.
 */
static void
handle_args(int argc, char **argv)
{
        static struct option long_options[] = {
                {"file", required_argument, NULL, 'f'},
                {"size", required_argument, NULL, 's'},
                {"block-size", required_argument, NULL, 'b'},
                {"time", required_argument, NULL, 't'},
                {NULL, 0, NULL, 0}
        };
        int             c;

        while ((c = getopt_long(argc, argv, "f:s:b:t:", long_options, NULL)) != -1) {
                switch (c) {
                case 'f':
                        test_filename = pg_strdup(optarg);
                        break;
                case 's':
                        /* ssize_t is the signed counterpart of size_t */
                        file_size = (ssize_t) parse_positive_arg(optarg, "--size",
                                                                 SIZE_MAX / 2);
                        break;
                case 'b':
                        block_size = (size_t) parse_positive_arg(optarg, "--block-size",
                                                                 SIZE_MAX / 2);
                        break;
                case 't':
                        test_seconds = (unsigned) parse_positive_arg(optarg, "--time",
                                                                     UINT_MAX);
                        break;
                default:
                        pg_fatal("Unknown argument");
                }
        }

        /* The random read test picks a block number modulo the block count. */
        if ((size_t) file_size < block_size)
                pg_fatal("file size must be at least one block (%zu bytes)",
                         block_size);
}

/*
 * Create (or truncate) the scratch file and fill it with file_size bytes,
 * written in chunks of at most block_size bytes, then flush it to disk.
 *
 * Sets needs_unlink as soon as the file exists so that cleanup() removes
 * it.  Any I/O failure is a fatal error reported with the system error
 * text.
 */
static void
create_test_file(void)
{
        int             fd;
        char           *buf;
        size_t          remaining;

        fd = open(test_filename, O_WRONLY | O_CREAT | O_TRUNC | PG_BINARY, 0600);
        if (fd < 0)
                pg_fatal("could not create test file \"%s\": %m", test_filename);

        needs_unlink = 1;

        /* Fill the buffer with a fixed, repeating 0..255 byte pattern */
        buf = pg_malloc(block_size);
        for (size_t i = 0; i < block_size; i++)
                buf[i] = (char)(i % 256);

        /*
         * Write the file block by block.  A short write simply leaves more
         * bytes for the next iteration; the content is a repeating pattern,
         * so restarting from the beginning of the buffer is fine.
         */
        remaining = (size_t) file_size;
        while (remaining > 0) {
                size_t          chunk = (remaining > block_size) ? block_size : remaining;
                ssize_t         written = write(fd, buf, chunk);

                if (written < 0)
                        pg_fatal("could not write test file \"%s\": %m", test_filename);
                /* guard against looping forever if no progress is made */
                if (written == 0)
                        pg_fatal("could not write test file \"%s\": wrote 0 bytes",
                                 test_filename);

                remaining -= (size_t) written;
        }

        /* Deferred write errors (e.g. out of space) can surface only here */
        if (fsync(fd) != 0)
                pg_fatal("could not fsync test file \"%s\": %m", test_filename);
        if (close(fd) != 0)
                pg_fatal("could not close test file \"%s\": %m", test_filename);

        free(buf);
}

/*
 * Measure sequential read throughput of the scratch file.
 *
 * The file is read front to back in block_size chunks, starting over at
 * offset 0 whenever the end is reached, until the SIGALRM handler sets
 * alarm_triggered after test_seconds seconds.  The result is printed in
 * MB/s.
 *
 * use_direct: when nonzero and DIRECT_FLAG is available, the file is
 * opened with DIRECT_FLAG; otherwise ordinary buffered reads are used.
 */
static void
test_sequential_read(int use_direct)
{
        /* 4096 also satisfies devices needing more than 512-byte alignment */
        enum { IO_BUFFER_ALIGNMENT = 4096 };

        int             fd;
        int             rc;
        void           *aligned = NULL;
        char           *buf;
        size_t          total_read = 0;
        struct timeval  start, end;
        double          elapsed, mbps;
        int             flags = O_RDONLY | PG_BINARY;

#ifdef DIRECT_FLAG
        if (use_direct)
                flags |= DIRECT_FLAG;
#endif

        /*
         * posix_memalign() reports failure through a positive return value
         * and does not set errno, so the code is passed to strerror().
         */
        rc = posix_memalign(&aligned, IO_BUFFER_ALIGNMENT, block_size);
        if (rc != 0)
                pg_fatal("posix_memalign failed: %s", strerror(rc));
        buf = aligned;

        if ((fd = open(test_filename, flags)) < 0)
                pg_fatal("could not open test file");

        printf("%s sequential read: ", use_direct ? "direct" : "buffered");
        fflush(stdout);

        alarm_triggered = false;
        alarm(test_seconds);
        gettimeofday(&start, NULL);

        while (!alarm_triggered) {
                off_t           offset = 0;

                while (offset < file_size && !alarm_triggered) {
                        ssize_t         bytes_read = pread(fd, buf, block_size, offset);

                        if (bytes_read < 0) {
                                /* interrupted by the alarm: re-check the flag */
                                if (errno == EINTR)
                                        continue;
                                pg_fatal("read error, rc=%zd: %m", bytes_read);
                        }

                        /* unexpected end of file: start over rather than spin */
                        if (bytes_read == 0)
                                break;

                        total_read += bytes_read;
                        offset += bytes_read;
                }
                /* pread() takes explicit offsets, so no seek is needed here */
        }

        gettimeofday(&end, NULL);
        close(fd);
        free(buf);

        elapsed = (end.tv_sec - start.tv_sec) +
                (end.tv_usec - start.tv_usec) / 1000000.0;

        mbps = (total_read / (1024.0 * 1024.0)) / elapsed;
        printf("%.2f MB/s\n", mbps);
}

/*
 * Remove the scratch file, but only if needs_unlink says that this run
 * actually created it.
 */
static void
cleanup(void)
{
        if (!needs_unlink)
                return;

        unlink(test_filename);
}

/*
 * Handler installed for SIGALRM: raises the flag that the read loops poll,
 * which ends the measurement currently in progress.
 */
static void
signal_handler(SIGNAL_ARGS)
{
        alarm_triggered = 1;
}

/*
 * Return the byte offset of a pseudo-randomly chosen block among the first
 * max_blocks blocks of the scratch file.  max_blocks must be nonzero.
 */
static off_t
random_block_offset(size_t max_blocks)
{
        return (off_t) ((pg_prng_uint64(&pg_global_prng_state) % max_blocks) *
                        block_size);
}

/*
 * Measure random read throughput of the scratch file and print it in MB/s.
 *
 * Blocks of block_size bytes are read at random block-aligned offsets
 * until the SIGALRM handler sets alarm_triggered after test_seconds
 * seconds.
 *
 * use_direct: when nonzero and DIRECT_FLAG is available, open the file
 * with DIRECT_FLAG.
 * use_advise: when nonzero, keep a ring of prefetch_distance upcoming
 * offsets, each announced with posix_fadvise(POSIX_FADV_WILLNEED) before
 * it is read.
 * prefetch_distance: number of announced-but-not-yet-read offsets; a value
 * of zero or less disables prefetching, as if use_advise were zero.
 */
static void
test_random_read(int use_direct, int use_advise, int prefetch_distance)
{
        /* 4096 also satisfies devices needing more than 512-byte alignment */
        enum { IO_BUFFER_ALIGNMENT = 4096 };

        int             fd;
        int             rc;
        void           *aligned = NULL;
        char           *buf;
        size_t          total_read = 0;
        struct timeval  start, end;
        double          elapsed, mbps;
        size_t          max_blocks;
        off_t          *prefetch_queue = NULL;
        int             qhead = 0;
        int             flags = O_RDONLY | PG_BINARY;

#ifdef DIRECT_FLAG
        if (use_direct)
                flags |= DIRECT_FLAG;
#endif

        /* Offsets are chosen modulo the block count, so it must be nonzero */
        if (block_size == 0 || (size_t) file_size < block_size)
                pg_fatal("test file size must be at least one block");
        max_blocks = file_size / block_size;

        /* An empty ring cannot be indexed; fall back to plain random reads */
        if (prefetch_distance <= 0)
                use_advise = 0;

        /*
         * posix_memalign() reports failure through a positive return value
         * and does not set errno, so the code is passed to strerror().
         */
        rc = posix_memalign(&aligned, IO_BUFFER_ALIGNMENT, block_size);
        if (rc != 0)
                pg_fatal("posix_memalign failed: %s", strerror(rc));
        buf = aligned;

        if ((fd = open(test_filename, flags)) < 0)
                pg_fatal("could not open test file");

        /* Initialize prefetch queue if using advice */
        if (use_advise) {
                prefetch_queue = pg_malloc((size_t) prefetch_distance *
                                           sizeof(*prefetch_queue));

                /* Pre-fill the queue; the advice is only a hint, so its
                 * result is deliberately ignored */
                for (int i = 0; i < prefetch_distance; i++) {
                        prefetch_queue[i] = random_block_offset(max_blocks);
                        (void) posix_fadvise(fd, prefetch_queue[i], block_size,
                                             POSIX_FADV_WILLNEED);
                }
        }

        alarm_triggered = false;
        alarm(test_seconds);
        gettimeofday(&start, NULL);

        while (!alarm_triggered) {
                off_t           offset;
                ssize_t         bytes_read;

                if (use_advise) {
                        off_t           new_offset = random_block_offset(max_blocks);

                        /* Read the oldest announced offset ... */
                        offset = prefetch_queue[qhead];

                        /* ... and announce a new one in its slot */
                        (void) posix_fadvise(fd, new_offset, block_size,
                                             POSIX_FADV_WILLNEED);
                        prefetch_queue[qhead] = new_offset;
                        qhead = (qhead + 1) % prefetch_distance;
                } else {
                        /* Simple random read without prefetch */
                        offset = random_block_offset(max_blocks);
                }

                bytes_read = pread(fd, buf, block_size, offset);
                if (bytes_read < 0) {
                        /* interrupted by the alarm: re-check the flag */
                        if (errno == EINTR)
                                continue;
                        pg_fatal("read error, rc=%zd: %m", bytes_read);
                }

                total_read += bytes_read;
        }

        gettimeofday(&end, NULL);
        close(fd);
        free(buf);
        free(prefetch_queue);

        elapsed = (end.tv_sec - start.tv_sec) +
                (end.tv_usec - start.tv_usec) / 1000000.0;

        mbps = (total_read / (1024.0 * 1024.0)) / elapsed;
        printf("%.2f MB/s\n", mbps);
}

Reply via email to