On 8/14/25 01:19, Andres Freund wrote:
> Hi,
> 
> On 2025-08-14 01:11:07 +0200, Tomas Vondra wrote:
>> On 8/13/25 23:57, Peter Geoghegan wrote:
>>> On Wed, Aug 13, 2025 at 5:19 PM Tomas Vondra <to...@vondra.me> wrote:
>>>> It's also not very surprising this happens with backwards scans more.
>>>> The I/O is apparently much slower (due to missing OS prefetch), so we're
>>>> much more likely to hit the I/O limits (max_ios and various other limits
>>>> in read_stream_start_pending_read).
>>>
>>> But there's no OS prefetch with direct I/O. At most, there might be
>>> some kind of readahead implemented in the SSD's firmware.
>>>
>>
>> Good point, I keep forgetting direct I/O means no OS read-ahead. Not
>> sure if there's a good way to determine if the SSD can do something like
>> that (and how well). I wonder if there's a way to do backward sequential
>> scans in fio ..
> 
> In theory, yes, in practice, not quite:
> https://github.com/axboe/fio/issues/1963
> 
> So right now it only works if you skip over some blocks. For that there are
> rather significant performance differences on my SSDs. E.g.
> 
> andres@awork3:~/src/fio$ fio --directory /srv/fio --size=$((1024*1024*1024)) 
> --name test --bs=4k --rw read:8k --buffered 0 2>&1|grep READ
>    READ: bw=179MiB/s (188MB/s), 179MiB/s-179MiB/s (188MB/s-188MB/s), 
> io=341MiB (358MB), run=1907-1907msec
> andres@awork3:~/src/fio$ fio --directory /srv/fio --size=$((1024*1024*1024)) 
> --name test --bs=4k --rw read:-8k --buffered 0 2>&1|grep READ
>    READ: bw=70.6MiB/s (74.0MB/s), 70.6MiB/s-70.6MiB/s (74.0MB/s-74.0MB/s), 
> io=1024MiB (1074MB), run=14513-14513msec
> 
> So on this WD Red SN700 there's a rather substantial performance difference.
> 
> On a Samsung 970 PRO I don't see much of a difference. Nor on a ADATA
> SX8200PNP.
> 

I experimented with this a little bit today. Given the fio issues, I
ended up writing a simple tool in C, doing pread() forward/backward with
different block size and direct I/O. AFAICS this is roughly equivalent
to fio with iodepth=1 (based on a couple tests).

Too bad fio has issues with backward sequential tests ... I'll see if I
can get at least some results to validate my results.

On all my SSDs there's a massive difference between forward and backward
sequential scans. It depends on the block size, but for the smaller
block sizes (1-16KB) it's roughly 4x slower. It gets better for larger
blocks, but while that's interesting, we're stuck with 8K blocks.


FWIW I'm not claiming this explains all odd things we're investigating
in this thread, it's more a confirmation that the scan direction may
matter if it translates to direction at the device level. I don't think
it can explain the strange stuff with the "random" data sets constructed
by Peter.


regards

-- 
Tomas Vondra

Attachment: seqscan.pdf
Description: Adobe PDF document

Attachment: seqscan-backward-vs-forward.pdf
Description: Adobe PDF document

/*
 * A simple test of forward/backward sequential scans with direct I/O.
 *
 * Build like this:
 *
 *		gcc -O2 -Werror -o direct-io-test direct-io-test.c
 *
 * Use like this:
 *
 * 		./direct-io-test /PATH/TO/FILE SIZE RUNS TIME_LIMIT
 *
 * with SIZE in gigabytes (e.g. 32 means 32GB), RUNS the number of times
 * the whole set of tests is repeated, and TIME_LIMIT the limit (in
 * seconds) applied to each individual scan. The file will be created and
 * populated with random data. Then it runs a number of tests with
 * different block sizes (1KB to 8MB) and directions. Can take a lot of
 * time (1h or more, depending on the storage device).
 */
#define _GNU_SOURCE

#include <unistd.h>
#include <stdlib.h>
#include <fcntl.h>
#include <stdio.h>

#include <sys/time.h>

/*
 * generate a file with size_gb size, fill it with random data
 */
/*
 * Create the file at 'path' (if it does not exist yet) and write size_gb
 * gigabytes of pseudo-random data into it, using direct I/O.
 *
 *	path	- file to create and populate
 *	size_gb	- amount of data to write, in gigabytes
 *
 * Every failure (open, buffer allocation, write, fsync, close) is reported
 * on stderr and the process is aborted, because a benchmark running on a
 * partially written file would produce meaningless numbers.
 */
void
generate_file(char *path, int size_gb)
{
/* longs to generate / write at once */
#define	BUFF_SIZE	1024

	long   *buff = NULL;
	size_t	buff_bytes = BUFF_SIZE * sizeof(long);
	long	size = (size_gb * 1024L * 1024L * 1024L);
	long	nwritten = 0;
	long	path_memalign;
	int		fd;
	int		rc;

	/*
	 * The owner needs write permission too. A file created read-only
	 * can't be reopened for writing when the tool is run again.
	 */
	fd = open(path, (O_WRONLY | O_CREAT | O_DIRECT), (S_IRUSR | S_IWUSR));
	if (fd < 0)
	{
		perror("open failed");
		fflush(NULL);
		abort();
	}

	/*
	 * pathconf() returns -1 when the alignment is unknown/unlimited, which
	 * is not a valid alignment. Fall back to 4kB in that case.
	 */
	path_memalign = pathconf(path, _PC_REC_XFER_ALIGN);
	if (path_memalign <= 0)
		path_memalign = 4096;

	/* posix_memalign returns the error number, it does not set errno */
	rc = posix_memalign((void **) &buff, path_memalign, buff_bytes);
	if (rc != 0)
	{
		fprintf(stderr, "posix_memalign failed (bytes %zu memalign %ld error %d)\n",
				buff_bytes, path_memalign, rc);
		fflush(NULL);
		abort();
	}

	while (nwritten < size)
	{
		char   *ptr = (char *) buff;
		size_t	remaining = buff_bytes;

		for (int i = 0; i < BUFF_SIZE; i++)
		{
			/*
			 * lrand48() returns 31 random bits, so combine two values
			 * (one shifted by 16 bits) to randomize more of the long
			 */
			buff[i] = ((lrand48() << 16) ^ lrand48());
		}

		/*
		 * Write the whole buffer, continuing after short writes. If the
		 * remainder is no longer suitably aligned for direct I/O, the
		 * next write() fails and we report that instead of silently
		 * leaving a hole in the data.
		 */
		while (remaining > 0)
		{
			ssize_t	written = write(fd, ptr, remaining);

			if (written < 0)
			{
				perror("write failed");
				fflush(NULL);
				abort();
			}
			else if (written == 0)
			{
				fprintf(stderr, "write failed: no data written\n");
				fflush(NULL);
				abort();
			}

			ptr += written;
			remaining -= (size_t) written;
		}

		nwritten += buff_bytes;
	}

	free(buff);

	if (fsync(fd) != 0)
	{
		perror("fsync failed");
		fflush(NULL);
		abort();
	}

	if (close(fd) != 0)
	{
		perror("close failed");
		fflush(NULL);
		abort();
	}
}

/*
 * Run forward/backward sequential scans, starting from 8MB blocks, and
 * going down to 1KB blocks. For each block test forward/backward scan.
 */
/* minimum number of blocks to read between two time-limit checks */
#define MIN_CHECK_DISTANCE 1000

/*
 * Microseconds elapsed between two gettimeofday() samples.
 */
static long
scan_elapsed_usecs(const struct timeval *start, const struct timeval *end)
{
	return (end->tv_sec - start->tv_sec) * 1000000L +
		(end->tv_usec - start->tv_usec);
}

/*
 * Run one sequential scan of the file and print a result line.
 *
 *	fd			- file opened for reading
 *	buff		- buffer of at least (bs * 1024) bytes
 *	bs			- block size in KB
 *	size		- number of bytes of the file to scan (multiple of block)
 *	run			- run number (only printed)
 *	run_limit	- time limit for the scan, in seconds
 *	backward	- 0 for forward scan, non-zero for backward scan
 *
 * The scan starts at a random block, moves block by block in the given
 * direction (wrapping around at the end/start of the file), and stops
 * after reading every block once or after exceeding the time limit,
 * whichever happens first. Read failures abort the process.
 */
static void
run_sequential_scan(int fd, char *buff, int bs, long size, int run,
					int run_limit, int backward)
{
	long	block_bytes = (bs * 1024L);
	long	nblocks = (size / block_bytes);	/* total */
	long	nblocks_read = 0;
	long	nblocks_check = MIN_CHECK_DISTANCE;
	off_t	pos;

	/* timing */
	struct timeval	ts,
					te;
	long	usecs;
	double	secs;

	gettimeofday(&ts, NULL);

	/* random starting position in the file (multiple of block size) */
	pos = (lrand48() % nblocks) * block_bytes;

	while (nblocks_read < nblocks)
	{
		/* signed type, so that the -1 error result is not lost */
		ssize_t	r = pread(fd, buff, block_bytes, pos);

		if (r < 0)
		{
			perror("pread failed");
			fflush(NULL);
			abort();
		}
		else if (r == 0)
		{
			/* retrying at EOF would never make progress */
			fprintf(stderr, "pread hit unexpected end of file at offset %lld\n",
					(long long) pos);
			fflush(NULL);
			abort();
		}
		else if (r < block_bytes)	/* incomplete read, just retry */
		{
			fprintf(stderr, "retry read\n");
			continue;
		}

		/* move to the next block, wrapping around at the file boundary */
		if (backward)
		{
			pos -= block_bytes;

			if (pos < 0)
				pos = size - block_bytes;
		}
		else
		{
			pos += block_bytes;

			if (pos >= size)
				pos = 0;
		}

		nblocks_read++;

		/* maybe check time limit */
		if (nblocks_check == nblocks_read)
		{
			gettimeofday(&te, NULL);

			usecs = scan_elapsed_usecs(&ts, &te);
			secs = (double) usecs / 1000000L;

			/* run exceeded limit */
			if (secs > run_limit)
				break;

			/*
			 * Schedule the next check half-way between now and the number
			 * of blocks we expect to read within the limit. Skip the
			 * estimate if no measurable time elapsed (division by zero).
			 */
			nblocks_check = 0;
			if (secs > 0)
			{
				long	nblocks_expected = run_limit * (nblocks_read / secs);

				nblocks_check = (nblocks_expected + nblocks_read) / 2;
			}

			if (nblocks_check < nblocks_read + MIN_CHECK_DISTANCE)
				nblocks_check = nblocks_read + MIN_CHECK_DISTANCE;
		}
	}

	gettimeofday(&te, NULL);

	usecs = scan_elapsed_usecs(&ts, &te);
	secs = (double) usecs / 1000000L;

	/* print info about the scan (throughput is 0 if no time elapsed) */
	printf("%16s%8d%8d%16ld%8.2f%16.2f\n",
		   backward ? "backward" : "forward", run, bs, usecs, secs,
		   (secs > 0) ? (nblocks_read * block_bytes / 1024L) / secs : 0.0);
	fflush(NULL);
}

/*
 * Run forward/backward sequential scans, starting from 8MB blocks, and
 * going down to 1KB blocks. For each block size test forward/backward scan.
 *
 *	path		- file to read (expected to hold at least size_gb gigabytes)
 *	size_gb		- amount of data to scan, in gigabytes
 *	run			- run number (only printed in the results)
 *	run_limit	- time limit for each individual scan, in seconds
 *
 * Prints a header and one result line per scan to stdout. Failures to
 * open or read the file, or to allocate the buffer, are reported on
 * stderr and abort the process.
 */
void
test_direct_io(char *path, int size_gb, int run, int run_limit)
{
	int		bs;	/* block size (in KB) */
	int		fd;
	long	size = (size_gb * 1024L * 1024L * 1024L);
	long	path_memalign;
	off_t	file_size;

	/* with no data there are no blocks to pick a starting position from */
	if (size_gb <= 0)
	{
		fprintf(stderr, "invalid size %d GB, nothing to test\n", size_gb);
		return;
	}

	fd = open(path, (O_RDONLY | O_DIRECT), 0);
	if (fd < 0)
	{
		perror("open failed");
		fflush(NULL);
		abort();
	}

	/*
	 * Make sure the file really is as large as we were told, otherwise
	 * the scans would read past the end of the file. pread() does not
	 * use the file offset, so moving it here is harmless.
	 */
	file_size = lseek(fd, 0, SEEK_END);
	if (file_size < 0)
	{
		perror("lseek failed");
		fflush(NULL);
		abort();
	}
	else if (file_size < size)
	{
		fprintf(stderr, "file too small (%lld bytes, expected %ld bytes)\n",
				(long long) file_size, size);
		fflush(NULL);
		abort();
	}

	/*
	 * pathconf() returns -1 when the alignment is unknown/unlimited, which
	 * is not a valid alignment. Fall back to 4kB in that case.
	 */
	path_memalign = pathconf(path, _PC_REC_XFER_ALIGN);
	if (path_memalign <= 0)
		path_memalign = 4096;

	/* print header */
	printf("%16s%8s%8s%16s%8s%16s\n", "direction", "run", "block", "usec", "sec", "KB/s");
	fflush(NULL);

	/* start from largest block, go down to 1KB */
	for (bs = 8192; bs > 0; bs /= 2)
	{
		char   *buff = NULL;
		long	block_bytes = (bs * 1024L);
		int		rc;

		/* posix_memalign returns the error number, it does not set errno */
		rc = posix_memalign((void **) &buff, path_memalign, block_bytes);
		if (rc != 0)
		{
			fprintf(stderr, "posix_memalign failed (bytes %ld memalign %ld error %d)\n",
					block_bytes, path_memalign, rc);
			fflush(NULL);
			abort();
		}

		/* forward direction first, then backward */
		run_sequential_scan(fd, buff, bs, size, run, run_limit, 0);
		run_sequential_scan(fd, buff, bs, size, run, run_limit, 1);

		free(buff);
	}

	close(fd);
}

/*
 * Parse a command-line argument as a decimal integer >= min_value.
 *
 * Prints an error (mentioning the argument 'name') and exits if the string
 * is not a valid number, has trailing garbage, or is out of range.
 */
static int
parse_int_arg(const char *str, const char *name, int min_value)
{
	char   *end;
	long	val = strtol(str, &end, 10);

	/* (int) round-trip rejects values that do not fit into an int */
	if (end == str || *end != '\0' || val < min_value || (long) (int) val != val)
	{
		fprintf(stderr, "invalid %s: \"%s\" (expected an integer >= %d)\n",
				name, str, min_value);
		exit(EXIT_FAILURE);
	}

	return (int) val;
}

/*
 * Entry point. Expects four arguments:
 *
 *		PATH SIZE RUNS TIME_LIMIT
 *
 * with SIZE in gigabytes, RUNS the number of times the whole set of scans
 * is repeated, and TIME_LIMIT the limit (in seconds) for each scan. The
 * file is generated first, then the scans are executed RUNS times.
 */
int
main(int argc, char **argv)
{
	char   *path;
	int		size_gb;
	int		runs;
	int		limit;

	/* without this check missing arguments would be read as NULL pointers */
	if (argc < 5)
	{
		fprintf(stderr, "usage: %s /PATH/TO/FILE SIZE RUNS TIME_LIMIT\n",
				(argc > 0) ? argv[0] : "direct-io-test");
		return EXIT_FAILURE;
	}

	path = argv[1];
	size_gb = parse_int_arg(argv[2], "size", 1);
	runs = parse_int_arg(argv[3], "runs", 1);
	limit = parse_int_arg(argv[4], "limit", 1);

	printf("path = %s\n", path);
	printf("size = %d GB\n", size_gb);
	printf("runs = %d\n", runs);
	printf("limit = %d\n", limit);

	fflush(NULL);

	generate_file(path, size_gb);

	printf("file generated\n");
	fflush(NULL);

	/* repeat the whole set of scans the requested number of times */
	for (int r = 1; r <= runs; r++)
		test_direct_io(path, size_gb, r, limit);

	return 0;
}

Reply via email to