Hi hackers,
Right now function WaitLatchOrSocket is implemented in very inefficient
way: for each invocation it creates epoll instance, registers events and
then closes this instance.
Certainly it is possible to create wait event set once with
CreateWaitEventSet and then use WaitEventSetWait.
And it is done in most performance critical places.
But there are still lot of places in Postgres where WaitLatchOrSocket or
WaitLatch are used.
One of them is postgres_fdw.
If we run pgbench through postgres_fdw (just redirect pgbench tables
using postgres_fdw to the localhost),
then at the system with large number (72) of CPU cores, "pgbench -S -M
prepared -c 100 -j 32" shows performance about 38k TPS with the
following profile:
- 73.83% 0.12% postgres postgres [.] PostgresMain ▒
- 73.82% PostgresMain ▒
+ 28.18% PortalRun ▒
+ 26.48% finish_xact_command ▒
+ 13.66% PortalStart ▒
+ 1.83% exec_simple_query ▒
+ 1.22% pq_getbyte ▒
+ 0.89% ReadyForQuery ▒
- 66.04% 0.03% postgres [kernel.kallsyms] [k]
entry_SYSCALL_64_fastpath ▒
- 66.01% entry_SYSCALL_64_fastpath ▒
+ 61.39% syscall_return_slowpath ▒
+ 1.52% sys_epoll_create1 ▒
+ 1.30% SYSC_sendto ▒
+ 0.94% sys_epoll_pwait ▒
0.57% SYSC_recvfrom ▒
- 65.61% 0.03% postgres postgres_fdw.so [.]
pgfdw_get_result ▒
- 65.58% pgfdw_get_result ▒
- 65.00% WaitLatchOrSocket ▒
+ 62.60% __close ▒
+ 1.62% CreateWaitEventSet ▒
- 65.09% 0.02% postgres postgres [.] WaitLatchOrSocket ▒
- 65.08% WaitLatchOrSocket ▒
+ 62.68% __close ▒
+ 1.62% CreateWaitEventSet ▒
+ 62.69% 0.02% postgres libpthread-2.26.so [.] __close ▒
So, you can see that more than 60% of CPU is spent in close.
If we cache used wait event sets, then performance is increased to 225k
TPS: five times!
At the systems with smaller number of cores effect of this patch is not
so large: at my desktop with 4 cores I get just about 10% improvement at
the same test.
There are two possible ways of fixing this issue:
1. Patch postgres_fdw to store WaitEventSet in connection.
2. Patch WaitLatchOrSocket to cache created wait event sets.
Second approach is more generic and cover all cases of WaitLatch usages.
Attached patch implements with approach.
The most challenging of this approach is using socket descriptor as
part of hash code.
Socket can be closed be already and reused, so cached epoll set will not
work any more.
To solve this issue I always try to add socket to the epoll set,
ignoring EEXIST error.
Certainly it will add some extra overhead, but looks like it is
negligible comparing with overhead of close (if I comment this branch,
then pgbench performance is almost the same - 227k TPS).
But if there are some other arguments against using cache in
WaitLatchOrSocket, we have a patch particularly for postgres_fdw.
--
Konstantin Knizhnik
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company
diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c
index e6706f7..b9cdead 100644
--- a/src/backend/storage/ipc/latch.c
+++ b/src/backend/storage/ipc/latch.c
@@ -51,6 +51,7 @@
#include "storage/latch.h"
#include "storage/pmsignal.h"
#include "storage/shmem.h"
+#include "utils/memutils.h"
/*
* Select the fd readiness primitive to use. Normally the "most modern"
@@ -340,6 +341,15 @@ WaitLatch(volatile Latch *latch, int wakeEvents, long timeout,
wait_event_info);
}
+#define WAIT_EVENT_HASH_SIZE 113
+
+typedef struct WaitEventHashEntry
+{
+ WaitEventSet* set;
+ int wakeEvents;
+ pgsocket sock;
+} WaitEventHashEntry;
+
/*
* Like WaitLatch, but with an extra socket argument for WL_SOCKET_*
* conditions.
@@ -359,29 +369,65 @@ WaitLatchOrSocket(volatile Latch *latch, int wakeEvents, pgsocket sock,
int ret = 0;
int rc;
WaitEvent event;
- WaitEventSet *set = CreateWaitEventSet(CurrentMemoryContext, 3);
+ WaitEventSet *set;
+ static WaitEventHashEntry wait_event_hash[WAIT_EVENT_HASH_SIZE];
+ size_t h = (wakeEvents | ((size_t)sock << 6)) % WAIT_EVENT_HASH_SIZE;
+ set = wait_event_hash[h].set;
+ if (set == NULL || wait_event_hash[h].wakeEvents != wakeEvents || wait_event_hash[h].sock != sock)
+ {
+ if (set)
+ FreeWaitEventSet(set);
+ set = CreateWaitEventSet(TopMemoryContext, 3);
+ wait_event_hash[h].set = set;
+ wait_event_hash[h].wakeEvents = wakeEvents;
+ wait_event_hash[h].sock = sock;
+
+ if (wakeEvents & WL_LATCH_SET)
+ AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET,
+ (Latch *) latch, NULL);
+
+ if (wakeEvents & WL_POSTMASTER_DEATH && IsUnderPostmaster)
+ AddWaitEventToSet(set, WL_POSTMASTER_DEATH, PGINVALID_SOCKET,
+ NULL, NULL);
+
+ if (wakeEvents & WL_SOCKET_MASK)
+ {
+ int ev;
+
+ ev = wakeEvents & WL_SOCKET_MASK;
+ AddWaitEventToSet(set, ev, sock, NULL, NULL);
+ }
+ }
+#if defined(WAIT_USE_EPOLL)
+ /*
+ * Socket descriptor can be closed and reused.
+ * Closed descriptor is automatically excluded from epoll set,
+ * but cached epoll set will not correctly work in this case.
+ * So we will try to always add socket descriptor and ignore EEXIST error.
+ */
+ else if (wakeEvents & WL_SOCKET_MASK)
+ {
+ struct epoll_event epoll_ev;
+ int rc;
+ epoll_ev.data.ptr = &set->events[set->nevents-1];
+ epoll_ev.events = EPOLLERR | EPOLLHUP;
+ if (wakeEvents & WL_SOCKET_READABLE)
+ epoll_ev.events |= EPOLLIN;
+ if (wakeEvents & WL_SOCKET_WRITEABLE)
+ epoll_ev.events |= EPOLLOUT;
+ rc = epoll_ctl(set->epoll_fd, EPOLL_CTL_ADD, sock, &epoll_ev);
+ if (rc < 0 && errno != EEXIST)
+ ereport(ERROR,
+ (errcode_for_socket_access(),
+ errmsg("epoll_ctl() failed: %m")));
+ }
+#endif
if (wakeEvents & WL_TIMEOUT)
Assert(timeout >= 0);
else
timeout = -1;
- if (wakeEvents & WL_LATCH_SET)
- AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET,
- (Latch *) latch, NULL);
-
- if (wakeEvents & WL_POSTMASTER_DEATH && IsUnderPostmaster)
- AddWaitEventToSet(set, WL_POSTMASTER_DEATH, PGINVALID_SOCKET,
- NULL, NULL);
-
- if (wakeEvents & WL_SOCKET_MASK)
- {
- int ev;
-
- ev = wakeEvents & WL_SOCKET_MASK;
- AddWaitEventToSet(set, ev, sock, NULL, NULL);
- }
-
rc = WaitEventSetWait(set, timeout, &event, 1, wait_event_info);
if (rc == 0)
@@ -392,9 +438,6 @@ WaitLatchOrSocket(volatile Latch *latch, int wakeEvents, pgsocket sock,
WL_POSTMASTER_DEATH |
WL_SOCKET_MASK);
}
-
- FreeWaitEventSet(set);
-
return ret;
}