Here is a slightly cleaner version of the patch.
Sokolov Yura wrote on 2017-05-25 15:22:
Good day, everyone.
I've been playing with pgbench on a huge machine
(72 cores, 56 for PostgreSQL, enough memory to fit the database
both into shared_buffers and the file cache;
pgbench scale 500, unlogged tables, fsync=off,
synchronous_commit=off, wal_writer_flush_after=0).
With 200 clients, performance is around 76000 tps, and the main
bottleneck in this simple test is LWLockWaitListLock.
I added a gcc-specific implementation of pg_atomic_fetch_or_u32_impl
(i.e. using __sync_fetch_and_or), and performance rose to 83000 tps.
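For reference, the gcc-specific variant is essentially the following
sketch (the exact feature-test guards are in the attached patch):

static inline uint32
pg_atomic_fetch_or_u32_impl(volatile pg_atomic_uint32 *ptr, uint32 or_)
{
    return __sync_fetch_and_or(&ptr->value, or_);
}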
The speedup looked a bit strange at first, because __sync_fetch_and_or
compiles to almost the same CAS loop as the generic implementation.
Looking closer, I noticed that the intrinsic does not perform the read
inside the loop body, but only once at loop initialization. That is
correct behavior, because the `lock cmpxchg` instruction stores the old
value in the EAX register.
This is the expected behavior, and pg_atomic_compare_exchange_*_impl
does the same in all implementations: on failure it writes the value it
found back into *expected. So there is no need to re-read the value in
the loop body:
Example diff for pg_atomic_exchange_u32_impl:
static inline uint32
pg_atomic_exchange_u32_impl(volatile pg_atomic_uint32 *ptr, uint32 xchg_)
{
    uint32 old;
+   old = pg_atomic_read_u32_impl(ptr);
    while (true)
    {
-       old = pg_atomic_read_u32_impl(ptr);
        if (pg_atomic_compare_exchange_u32_impl(ptr, &old, xchg_))
            break;
    }
    return old;
}
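For clarity, here is a small standalone sketch of the same idea outside
the PostgreSQL sources, assuming GCC's __sync builtins (the helper names
cas_u32 and fetch_or_u32 are made up for illustration):

#include <stdbool.h>
#include <stdint.h>

/*
 * Compare-exchange with the contract assumed above: on failure, the
 * value actually found in memory is written back into *expected, just
 * as `lock cmpxchg` leaves the observed value in EAX.
 */
static inline bool
cas_u32(volatile uint32_t *ptr, uint32_t *expected, uint32_t newval)
{
    uint32_t found = __sync_val_compare_and_swap(ptr, *expected, newval);
    bool     ok = (found == *expected);

    *expected = found;   /* a failed CAS already refreshed our local copy */
    return ok;
}

/*
 * fetch-or built on top of it: read once before the loop; every failed
 * CAS hands back the fresh value, so no extra read is needed in the
 * loop body.
 */
static inline uint32_t
fetch_or_u32(volatile uint32_t *ptr, uint32_t or_)
{
    uint32_t old = *ptr;

    while (!cas_u32(ptr, &old, old | or_))
        ;
    return old;
}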
After applying this change to all generic atomic functions (including
pg_atomic_fetch_or_u32_impl), performance became equal to that of the
__sync_fetch_and_or intrinsic.
The attached patch makes this change for all generic atomic functions,
and also adds __sync_fetch_and_(or|and) implementations for gcc, because
I believe GCC optimizes code around an intrinsic better than around
inline assembler.
(Final performance is around 86000 tps, but the difference between
83000 tps and 86000 tps is hard to distinguish from noise on a NUMA system.)
With regards,
--
Sokolov Yura aka funny_falcon
Postgres Professional: https://postgrespro.ru
The Russian Postgres Company
From 2c58a32ca89867e7d244e7037a38aa9ccc2b92c2 Mon Sep 17 00:00:00 2001
From: Sokolov Yura <funny.fal...@postgrespro.ru>
Date: Thu, 25 May 2017 14:54:45 +0300
Subject: [PATCH] Fix performance of Atomics generic implementation
pg_atomic_compare_exchange_*_impl modifies the value behind the `old`
pointer, so there is no need to re-read the value in the loop body.
Also add gcc-specific __sync_fetch_and_(or|and), because the compiler
seems to optimize code around an intrinsic better than around inline
assembler, although the intrinsic compiles to almost the same CAS loop.
---
src/include/port/atomics/generic-gcc.h | 36 +++++++++++++++++++
src/include/port/atomics/generic.h | 64 +++++++++-------------------------
2 files changed, 52 insertions(+), 48 deletions(-)
diff --git a/src/include/port/atomics/generic-gcc.h b/src/include/port/atomics/generic-gcc.h
index 7efc0861e7..79ada94047 100644
--- a/src/include/port/atomics/generic-gcc.h
+++ b/src/include/port/atomics/generic-gcc.h
@@ -185,6 +185,24 @@ pg_atomic_fetch_add_u32_impl(volatile pg_atomic_uint32 *ptr, int32 add_)
}
#endif
+#if !defined(PG_HAVE_ATOMIC_FETCH_AND_U32) && defined(HAVE_GCC__SYNC_INT32_CAS)
+#define PG_HAVE_ATOMIC_FETCH_AND_U32
+static inline uint32
+pg_atomic_fetch_and_u32_impl(volatile pg_atomic_uint32 *ptr, uint32 and_)
+{
+ return __sync_fetch_and_and(&ptr->value, and_);
+}
+#endif
+
+#if !defined(PG_HAVE_ATOMIC_FETCH_OR_U32) && defined(HAVE_GCC__SYNC_INT32_CAS)
+#define PG_HAVE_ATOMIC_FETCH_OR_U32
+static inline uint32
+pg_atomic_fetch_or_u32_impl(volatile pg_atomic_uint32 *ptr, uint32 or_)
+{
+ return __sync_fetch_and_or(&ptr->value, or_);
+}
+#endif
+
#if !defined(PG_DISABLE_64_BIT_ATOMICS)
@@ -223,6 +241,24 @@ pg_atomic_fetch_add_u64_impl(volatile pg_atomic_uint64 *ptr, int64 add_)
}
#endif
+#if !defined(PG_HAVE_ATOMIC_FETCH_AND_U64) && defined(HAVE_GCC__SYNC_INT64_CAS)
+#define PG_HAVE_ATOMIC_FETCH_AND_U64
+static inline uint64
+pg_atomic_fetch_and_u64_impl(volatile pg_atomic_uint64 *ptr, uint64 and_)
+{
+ return __sync_fetch_and_and(&ptr->value, and_);
+}
+#endif
+
+#if !defined(PG_HAVE_ATOMIC_FETCH_OR_U64) && defined(HAVE_GCC__SYNC_INT64_CAS)
+#define PG_HAVE_ATOMIC_FETCH_OR_U64
+static inline uint64
+pg_atomic_fetch_or_u64_impl(volatile pg_atomic_uint64 *ptr, uint64 or_)
+{
+ return __sync_fetch_and_or(&ptr->value, or_);
+}
+#endif
+
#endif /* !defined(PG_DISABLE_64_BIT_ATOMICS) */
#endif /* defined(HAVE_ATOMICS) */
diff --git a/src/include/port/atomics/generic.h b/src/include/port/atomics/generic.h
index 424543604a..6d2671beab 100644
--- a/src/include/port/atomics/generic.h
+++ b/src/include/port/atomics/generic.h
@@ -170,12 +170,8 @@ static inline uint32
pg_atomic_exchange_u32_impl(volatile pg_atomic_uint32 *ptr, uint32 xchg_)
{
uint32 old;
- while (true)
- {
- old = pg_atomic_read_u32_impl(ptr);
- if (pg_atomic_compare_exchange_u32_impl(ptr, &old, xchg_))
- break;
- }
+ old = pg_atomic_read_u32_impl(ptr);
+ while (!pg_atomic_compare_exchange_u32_impl(ptr, &old, xchg_));
return old;
}
#endif
@@ -186,12 +182,8 @@ static inline uint32
pg_atomic_fetch_add_u32_impl(volatile pg_atomic_uint32 *ptr, int32 add_)
{
uint32 old;
- while (true)
- {
- old = pg_atomic_read_u32_impl(ptr);
- if (pg_atomic_compare_exchange_u32_impl(ptr, &old, old + add_))
- break;
- }
+ old = pg_atomic_read_u32_impl(ptr);
+ while (!pg_atomic_compare_exchange_u32_impl(ptr, &old, old + add_));
return old;
}
#endif
@@ -211,12 +203,8 @@ static inline uint32
pg_atomic_fetch_and_u32_impl(volatile pg_atomic_uint32 *ptr, uint32 and_)
{
uint32 old;
- while (true)
- {
- old = pg_atomic_read_u32_impl(ptr);
- if (pg_atomic_compare_exchange_u32_impl(ptr, &old, old & and_))
- break;
- }
+ old = pg_atomic_read_u32_impl(ptr);
+ while (!pg_atomic_compare_exchange_u32_impl(ptr, &old, old & and_));
return old;
}
#endif
@@ -227,12 +215,8 @@ static inline uint32
pg_atomic_fetch_or_u32_impl(volatile pg_atomic_uint32 *ptr, uint32 or_)
{
uint32 old;
- while (true)
- {
- old = pg_atomic_read_u32_impl(ptr);
- if (pg_atomic_compare_exchange_u32_impl(ptr, &old, old | or_))
- break;
- }
+ old = pg_atomic_read_u32_impl(ptr);
+ while (!pg_atomic_compare_exchange_u32_impl(ptr, &old, old | or_));
return old;
}
#endif
@@ -261,12 +245,8 @@ static inline uint64
pg_atomic_exchange_u64_impl(volatile pg_atomic_uint64 *ptr, uint64 xchg_)
{
uint64 old;
- while (true)
- {
- old = ptr->value;
- if (pg_atomic_compare_exchange_u64_impl(ptr, &old, xchg_))
- break;
- }
+ old = ptr->value;
+ while (!pg_atomic_compare_exchange_u64_impl(ptr, &old, xchg_));
return old;
}
#endif
@@ -357,12 +337,8 @@ static inline uint64
pg_atomic_fetch_add_u64_impl(volatile pg_atomic_uint64 *ptr, int64 add_)
{
uint64 old;
- while (true)
- {
- old = pg_atomic_read_u64_impl(ptr);
- if (pg_atomic_compare_exchange_u64_impl(ptr, &old, old + add_))
- break;
- }
+ old = pg_atomic_read_u64_impl(ptr);
+ while (!pg_atomic_compare_exchange_u64_impl(ptr, &old, old + add_));
return old;
}
#endif
@@ -382,12 +358,8 @@ static inline uint64
pg_atomic_fetch_and_u64_impl(volatile pg_atomic_uint64 *ptr, uint64 and_)
{
uint64 old;
- while (true)
- {
- old = pg_atomic_read_u64_impl(ptr);
- if (pg_atomic_compare_exchange_u64_impl(ptr, &old, old & and_))
- break;
- }
+ old = pg_atomic_read_u64_impl(ptr);
+ while (!pg_atomic_compare_exchange_u64_impl(ptr, &old, old & and_));
return old;
}
#endif
@@ -398,12 +370,8 @@ static inline uint64
pg_atomic_fetch_or_u64_impl(volatile pg_atomic_uint64 *ptr, uint64 or_)
{
uint64 old;
- while (true)
- {
- old = pg_atomic_read_u64_impl(ptr);
- if (pg_atomic_compare_exchange_u64_impl(ptr, &old, old | or_))
- break;
- }
+ old = pg_atomic_read_u64_impl(ptr);
+ while (!pg_atomic_compare_exchange_u64_impl(ptr, &old, old | or_));
return old;
}
#endif
--
2.11.0