On 8/14/25 01:19, Andres Freund wrote: > Hi, > > On 2025-08-14 01:11:07 +0200, Tomas Vondra wrote: >> On 8/13/25 23:57, Peter Geoghegan wrote: >>> On Wed, Aug 13, 2025 at 5:19 PM Tomas Vondra <to...@vondra.me> wrote: >>>> It's also not very surprising this happens with backwards scans more. >>>> The I/O is apparently much slower (due to missing OS prefetch), so we're >>>> much more likely to hit the I/O limits (max_ios and various other limits >>>> in read_stream_start_pending_read). >>> >>> But there's no OS prefetch with direct I/O. At most, there might be >>> some kind of readahead implemented in the SSD's firmware. >>> >> >> Good point, I keep forgetting direct I/O means no OS read-ahead. Not >> sure if there's a good way to determine if the SSD can do something like >> that (and how well). I wonder if there's a way to do backward sequential >> scans in fio .. > > In theory, yes, in practice, not quite: > https://github.com/axboe/fio/issues/1963 > > So right now it only works if you skip over some blocks. For that there rather > significant performance differences on my SSDs. E.g. > > andres@awork3:~/src/fio$ fio --directory /srv/fio --size=$((1024*1024*1024)) > --name test --bs=4k --rw read:8k --buffered 0 2>&1|grep READ > READ: bw=179MiB/s (188MB/s), 179MiB/s-179MiB/s (188MB/s-188MB/s), > io=341MiB (358MB), run=1907-1907msec > andres@awork3:~/src/fio$ fio --directory /srv/fio --size=$((1024*1024*1024)) > --name test --bs=4k --rw read:-8k --buffered 0 2>&1|grep READ > READ: bw=70.6MiB/s (74.0MB/s), 70.6MiB/s-70.6MiB/s (74.0MB/s-74.0MB/s), > io=1024MiB (1074MB), run=14513-14513msec > > So on this WD Red SN700 there's a rather substantial performance difference. > > On a Samsung 970 PRO I don't see much of a difference. Nor on a ADATA > SX8200PNP. >
I experimented with this a little bit today. Given the fio issues, I ended up writing a simple tool in C, doing pread() forward/backward with different block sizes and direct I/O. AFAICS this is roughly equivalent to fio with iodepth=1 (based on a couple of tests). Too bad fio has issues with backward sequential tests ... I'll see if I can get at least some fio results to cross-check my own. On all my SSDs there's a massive difference between forward and backward sequential scans. It depends on the block size, but for the smaller block sizes (1-16KB) the backward scan is roughly 4x slower. It gets better for larger blocks, but while that's interesting, we're stuck with 8K blocks. FWIW I'm not claiming this explains all the odd things we're investigating in this thread, it's more a confirmation that the scan direction may matter if it translates to direction at the device level. I don't think it can explain the strange stuff with the "random" data sets constructed by Peter. regards -- Tomas Vondra
seqscan.pdf
Description: Adobe PDF document
seqscan-backward-vs-forward.pdf
Description: Adobe PDF document
/* * A simple test of forward/backward sequential scans with direct I/O. * * Build like this: * * gcc -O2 -Werror -o direct-io-test direct-io-test.c * * Use like this: * * ./direct-io-test /PATH/TO/FILE SIZE TIME_LIMIT * * with size in gigabytes (e.g. 32 means 32GB). The file will be created * and populated with random data. Then it runs a number of test with * different block sizes (1KB to 8MB) and directions. Can take a lot of * time (1h or more, depending on the storage device). */ #define _GNU_SOURCE #include <unistd.h> #include <stdlib.h> #include <fcntl.h> #include <stdio.h> #include <sys/time.h> /* * generate a file with size_gb size, fill it with random data */ void generate_file(char *path, int size_gb) { /* longs to generate / write at once */ #define BUFF_SIZE 1024 long *buff; size_t buff_bytes = BUFF_SIZE * sizeof(long); long size = (size_gb * 1024L * 1024L * 1024L); long nwritten = 0; int fd = open(path, (O_WRONLY | O_CREAT | O_DIRECT), (S_IRUSR | S_IRUSR)); long path_memalign = pathconf(path,_PC_REC_XFER_ALIGN); if (posix_memalign((void **) &buff, path_memalign, buff_bytes) != 0) { printf("posix_memalign failed (bytes %u memalign %ld)\n", buff_bytes, path_memalign); fflush(NULL); abort(); } while (nwritten < size) { for (int i = 0; i < BUFF_SIZE; i++) { /* combine two 48-bit values into a 64-bit one */ buff[i] = ((lrand48() << 16) ^ lrand48()); } write(fd, buff, buff_bytes); nwritten += buff_bytes; } free(buff); fsync(fd); close(fd); } /* * Run forward/backward sequential scans, starting from 8MB blocks, and * going down to 1KB blocks. For each block test forward/backward scan. 
*/ void test_direct_io(char *path, int size_gb, int run, int run_limit) { int bs; /* block size (in KB) */ int fd = open(path, (O_RDONLY | O_DIRECT), 0); long size_kbs = (size_gb * 1024L * 1024L); long size = (size_kbs * 1024L); long path_memalign = pathconf(path,_PC_REC_XFER_ALIGN); /* print header */ printf("%16s%8s%8s%16s%8s%16s\n", "direction", "run", "block", "usec", "sec", "KB/s"); fflush(NULL); /* start from largest block, go down to 1KB */ bs = 8192; while (bs > 0) { char *buff = NULL; long block_bytes = (bs * 1024L); long nblocks = (size / block_bytes); /* total */ long nblocks_read; #define MIN_CHECK_DISTANCE 1000 long nblocks_check; off_t pos; /* timing */ struct timeval ts, te; long usecs; double secs; posix_memalign((void **) &buff, path_memalign, block_bytes); /* forward direction */ gettimeofday(&ts, NULL); /* random starting position in the file (multiple of block size) */ pos = (lrand48() % nblocks) * block_bytes; nblocks_read = 0; nblocks_check = MIN_CHECK_DISTANCE; while (nblocks_read < nblocks) { size_t r = pread(fd, buff, block_bytes, pos); if (r == -1) { printf("pread failed %ld %m\n", r); fflush(NULL); abort(); } else if (r < block_bytes) /* incomplete read, just retry */ continue; pos += block_bytes; if (pos >= size) pos = 0; nblocks_read++; /* maybe check time limit */ if (nblocks_check == nblocks_read) { long nblocks_expected; gettimeofday(&te, NULL); usecs = (te.tv_sec - ts.tv_sec) * 1000000L + (te.tv_usec - ts.tv_usec); secs = (double) usecs / 1000000L; /* run exceeded limit */ if (secs > run_limit) break; /* how many blocks we expect to read in the limit */ nblocks_expected = run_limit * (nblocks_read / secs); nblocks_check = (nblocks_expected + nblocks_read) / 2; if (nblocks_check < nblocks_read + MIN_CHECK_DISTANCE) nblocks_check = nblocks_read + MIN_CHECK_DISTANCE; } } gettimeofday(&te, NULL); usecs = (te.tv_sec - ts.tv_sec) * 1000000L + (te.tv_usec - ts.tv_usec); secs = (double) usecs / 1000000L; /* print info about the forward scan 
*/ printf("%16s%8d%8d%16ld%8.2f%16.2f\n", "forward", run, bs, usecs, secs, (nblocks_read * block_bytes / 1024L) / secs); fflush(NULL); /* backward direction */ gettimeofday(&ts, NULL); /* random starting position in the file (multiple of block size) */ pos = (lrand48() % nblocks) * block_bytes; nblocks_read = 0; nblocks_check = MIN_CHECK_DISTANCE; while (nblocks_read < nblocks) /* number of blocks remaining */ { size_t r = pread(fd, buff, block_bytes, pos); if (r == -1) { printf("pread failed %ld %m\n", r); fflush(NULL); abort(); } else if (r < block_bytes) /* incomplete read, just retry */ { printf("retry write\n"); fflush(NULL); continue; } pos -= block_bytes; if (pos < 0) pos = size - block_bytes; nblocks_read++; /* maybe check time limit */ if (nblocks_check == nblocks_read) { long nblocks_expected; gettimeofday(&te, NULL); usecs = (te.tv_sec - ts.tv_sec) * 1000000L + (te.tv_usec - ts.tv_usec); secs = (double) usecs / 1000000L; /* run exceeded limit */ if (secs > run_limit) break; /* how many blocks we expect to read in the limit */ nblocks_expected = run_limit * (nblocks_read / secs); nblocks_check = (nblocks_expected + nblocks_read) / 2; if (nblocks_check < nblocks_read + MIN_CHECK_DISTANCE) nblocks_check = nblocks_read + MIN_CHECK_DISTANCE; } } gettimeofday(&te, NULL); usecs = (te.tv_sec - ts.tv_sec) * 1000000L + (te.tv_usec - ts.tv_usec); secs = (double) usecs / 1000000L; /* print info about the backward scan */ printf("%16s%8d%8d%16ld%8.2f%16.2f\n", "backward", run, bs, usecs, secs, (nblocks_read * block_bytes / 1024L) / secs); fflush(NULL); free(buff); /* proceed to smaller block */ bs /= 2; } close(fd); } int main(int argc, char **argv) { char *path = argv[1]; int size_gb = atoi(argv[2]); int runs = atoi(argv[3]); int limit = atoi(argv[4]); printf("path = %s\n", path); printf("size = %d GB\n", size_gb); printf("runs = %d\n", runs); printf("limit = %d\n", limit); fflush(NULL); generate_file(path, size_gb); printf("file generated\n"); fflush(NULL); /* */ 
for (int r = 1; r <= runs; r++) test_direct_io(path, size_gb, r, limit); return 0; }