On Tue, Jun 21, 2022 at 10:33 PM Jakub Wartak <jakub.war...@tomtom.com> wrote:
> > > Maybe the important question is why the readahead mechanism would be
> > > disabled in the first place via /sys | blockdev?
> >
> > Because database should know better than OS which data needs to be
> > prefetched and which should not. Big OS readahead affects index scan
> > performance.
>
> OK, fair point; however, the patch here is adding 1 syscall per XLOG_BLCKSZ,
> which is not cheap either. The code is already hot, and there is an example
> from the past where syscalls were limiting the performance [1]. Maybe it
> could prefetch in larger batches (128kB? 1MB? 16MB?)?

I've always thought we'd want to tell it about the *next* segment
file, to smooth the transition from one file to the next, something
like the attached (not tested).
diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c
index 6eba626420..e1dc37d3c2 100644
--- a/src/backend/access/transam/xlogrecovery.c
+++ b/src/backend/access/transam/xlogrecovery.c
@@ -4032,6 +4032,25 @@ rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN)
 }
 
 
+/*
+ * Ask the kernel to start reading a logfile segment ahead of time.  This is
+ * advisory only: a segment that cannot be opened is silently skipped.
+ */
+static void
+XLogFilePrefetch(XLogSegNo segno, TimeLineID tli)
+{
+	char		segpath[MAXPGPATH];
+	File		vfd;
+
+	XLogFilePath(segpath, tli, segno, wal_segment_size);
+	vfd = PathNameOpenFile(segpath, O_RDONLY | PG_BINARY);
+	if (vfd < 0)
+		return;					/* nothing to hint about */
+
+	FilePrefetch(vfd, 0, 0, WAIT_EVENT_WAL_PREFETCH);	/* result ignored */
+	FileClose(vfd);
+}
+
 /*
  * Open a logfile segment for reading (during recovery).
  *
@@ -4106,6 +4125,13 @@ XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
 		if (source != XLOG_FROM_STREAM)
 			XLogReceiptTime = GetCurrentTimestamp();
 
+		/*
+		 * Every time we open a file from pg_wal, hint to the kernel that we'll
+		 * likely soon be reading the next segment.
+		 */
+		if (readSource == XLOG_FROM_PG_WAL)
+			XLogFilePrefetch(segno + 1, tli);
+
 		return fd;
 	}
 	if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
diff --git a/src/backend/utils/activity/wait_event.c b/src/backend/utils/activity/wait_event.c
index 87c15b9c6f..f45d32ceef 100644
--- a/src/backend/utils/activity/wait_event.c
+++ b/src/backend/utils/activity/wait_event.c
@@ -729,6 +729,9 @@ pgstat_get_wait_io(WaitEventIO w)
 		case WAIT_EVENT_WAL_INIT_WRITE:
 			event_name = "WALInitWrite";
 			break;
+		case WAIT_EVENT_WAL_PREFETCH:
+			event_name = "WALPrefetch";
+			break;
 		case WAIT_EVENT_WAL_READ:
 			event_name = "WALRead";
 			break;
diff --git a/src/include/utils/wait_event.h b/src/include/utils/wait_event.h
index b578e2ec75..1473a4388f 100644
--- a/src/include/utils/wait_event.h
+++ b/src/include/utils/wait_event.h
@@ -226,6 +226,7 @@ typedef enum
 	WAIT_EVENT_WAL_COPY_WRITE,
 	WAIT_EVENT_WAL_INIT_SYNC,
 	WAIT_EVENT_WAL_INIT_WRITE,
+	WAIT_EVENT_WAL_PREFETCH,
 	WAIT_EVENT_WAL_READ,
 	WAIT_EVENT_WAL_SYNC,
 	WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN,

Reply via email to