On Tue, Aug 9, 2022 at 10:51 AM Nathan Bossart <nathandboss...@gmail.com> wrote:
> Fixed in v10.
I decided I wasn't quite comfortable changing snapshot handling
without further guarantees. To this end, 0002 in the attached v11 is
an addendum that adds assert checking (plus a pgindent run and some
comment-smithing). As I suspected, make check-world passes even with
purposefully screwed-up coding. 0003 uses pg_lfind32 in syscache.c, and
I verified that sticking in the wrong answer will lead to a crash in
assert-enabled builds in short order. I'd kind of like to throw this
(or something else suitable) at the build farm first for that reason.
It's simpler than the qsort/qunique/binary search that was there
before, so that's nice, but I've not tried to test performance.
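To make concrete what the assert checking in 0002 guards against: it simply
cross-checks the SIMD result against a naive scalar scan. The same idea works
as a standalone harness outside the tree, along the lines of the sketch below
(the SSE2 loop is 0001's, extracted and renamed lfind32_sse2; none of this is
part of the patches):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <emmintrin.h>			/* SSE2 intrinsics */

/* 0001's inner loop, extracted so it compiles outside the tree */
bool
lfind32_sse2(uint32_t key, uint32_t *base, uint32_t nelem)
{
	uint32_t	i = 0;
	const __m128i keys = _mm_set1_epi32(key);	/* 4 copies of key */
	uint32_t	iterations = nelem & ~0xF;	/* round down to multiple of 16 */

	for (; i < iterations; i += 16)
	{
		/* load the next block of 16 values into 4 registers */
		const __m128i vals1 = _mm_loadu_si128((__m128i *) &base[i]);
		const __m128i vals2 = _mm_loadu_si128((__m128i *) &base[i + 4]);
		const __m128i vals3 = _mm_loadu_si128((__m128i *) &base[i + 8]);
		const __m128i vals4 = _mm_loadu_si128((__m128i *) &base[i + 12]);

		/* compare each value to the key and OR the results together */
		const __m128i result =
		_mm_or_si128(_mm_or_si128(_mm_cmpeq_epi32(keys, vals1),
								  _mm_cmpeq_epi32(keys, vals2)),
					 _mm_or_si128(_mm_cmpeq_epi32(keys, vals3),
								  _mm_cmpeq_epi32(keys, vals4)));

		if (_mm_movemask_epi8(result) != 0)
			return true;
	}

	/* handle the remaining tail elements */
	for (; i < nelem; i++)
		if (key == base[i])
			return true;
	return false;
}

int
main(void)
{
	uint32_t	arr[1000];

	for (int trial = 0; trial < 100000; trial++)
	{
		uint32_t	nelem = rand() % 1000;
		uint32_t	key = rand() % 64;
		bool		expected = false;

		/* small value range so both hits and misses are exercised */
		for (uint32_t i = 0; i < nelem; i++)
			arr[i] = rand() % 64;
		for (uint32_t i = 0; i < nelem; i++)
			if (arr[i] == key)
				expected = true;

		assert(lfind32_sse2(key, arr, nelem) == expected);
	}
	return 0;
}

Breaking one of the _mm_cmpeq_epi32 calls makes the assert fire within a few
trials, which is the same failure mode I wanted the in-tree asserts to catch.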
--
John Naylor
EDB: http://www.enterprisedb.com
From 1c89b9b7d3c71bb1a703caaf7c01c93bc9e2515f Mon Sep 17 00:00:00 2001
From: John Naylor <john.nay...@postgresql.org>
Date: Tue, 9 Aug 2022 11:51:52 +0700
Subject: [PATCH v11 2/4] Add assert checking, run pgindent, comment changes
---
src/include/port/pg_lfind.h | 78 ++++++++++++++++++++++++++-----------
1 file changed, 56 insertions(+), 22 deletions(-)
diff --git a/src/include/port/pg_lfind.h b/src/include/port/pg_lfind.h
index 24de544f63..fb125977b2 100644
--- a/src/include/port/pg_lfind.h
+++ b/src/include/port/pg_lfind.h
@@ -18,51 +18,85 @@
/*
* pg_lfind32
*
- * Returns true if there is an element in 'base' that equals 'key'. Otherwise,
- * returns false.
+ * Return true if there is an element in 'base' that equals 'key', otherwise
+ * return false.
*/
static inline bool
pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
{
uint32 i = 0;
- /* If possible, use SSE2 intrinsics to speed up the search. */
+ /* Use SIMD intrinsics where available. */
#ifdef USE_SSE2
- const __m128i keys = _mm_set1_epi32(key); /* load 4 copies of key */
- uint32 iterations = nelem & ~0xF; /* round down to multiple of 16 */
- for (; i < iterations; i += 16)
+ /*
+ * A 16-byte register only has four 4-byte lanes. For better
+ * instruction-level parallelism, each loop iteration operates on a block
+ * of four registers. Testing has shown this is ~40% faster than using a
+ * block of two registers.
+ */
+ const __m128i keys = _mm_set1_epi32(key); /* load 4 copies of key */
+ uint32 iterations = nelem & ~0xF; /* round down to multiple of 16 */
+
+#if defined(USE_ASSERT_CHECKING)
+ bool assert_result = false;
+
+ /* pre-compute the result for assert checking */
+ for (i = 0; i < nelem; i++)
{
- /* load the next 16 values into __m128i variables */
- const __m128i vals1 = _mm_loadu_si128((__m128i *) &base[i]);
- const __m128i vals2 = _mm_loadu_si128((__m128i *) &base[i + 4]);
- const __m128i vals3 = _mm_loadu_si128((__m128i *) &base[i + 8]);
- const __m128i vals4 = _mm_loadu_si128((__m128i *) &base[i + 12]);
+ if (key == base[i])
+ {
+ assert_result = true;
+ break;
+ }
+ }
+#endif
- /* perform the comparisons */
- const __m128i result1 = _mm_cmpeq_epi32(keys, vals1);
- const __m128i result2 = _mm_cmpeq_epi32(keys, vals2);
- const __m128i result3 = _mm_cmpeq_epi32(keys, vals3);
- const __m128i result4 = _mm_cmpeq_epi32(keys, vals4);
+ for (i = 0; i < iterations; i += 16)
+ {
+ /* load the next block into 4 registers holding 4 values each */
+ const __m128i vals1 = _mm_loadu_si128((__m128i *) & base[i]);
+ const __m128i vals2 = _mm_loadu_si128((__m128i *) & base[i + 4]);
+ const __m128i vals3 = _mm_loadu_si128((__m128i *) & base[i + 8]);
+ const __m128i vals4 = _mm_loadu_si128((__m128i *) & base[i + 12]);
- /* shrink the results into a single variable */
- const __m128i tmp1 = _mm_or_si128(result1, result2);
- const __m128i tmp2 = _mm_or_si128(result3, result4);
- const __m128i result = _mm_or_si128(tmp1, tmp2);
+ /* compare each value to the key */
+ const __m128i result1 = _mm_cmpeq_epi32(keys, vals1);
+ const __m128i result2 = _mm_cmpeq_epi32(keys, vals2);
+ const __m128i result3 = _mm_cmpeq_epi32(keys, vals3);
+ const __m128i result4 = _mm_cmpeq_epi32(keys, vals4);
+
+ /* combine the results into a single variable */
+ const __m128i tmp1 = _mm_or_si128(result1, result2);
+ const __m128i tmp2 = _mm_or_si128(result3, result4);
+ const __m128i result = _mm_or_si128(tmp1, tmp2);
/* see if there was a match */
if (_mm_movemask_epi8(result) != 0)
+ {
+#if defined(USE_ASSERT_CHECKING)
+ Assert(assert_result == true);
+#endif
return true;
+ }
}
-#endif
+#endif /* USE_SSE2 */
- /* Process the remaining elements the slow way. */
+ /* Process the remaining elements one at a time. */
for (; i < nelem; i++)
{
if (key == base[i])
+ {
+#if defined(USE_SSE2) && defined(USE_ASSERT_CHECKING)
+ Assert(assert_result == true);
+#endif
return true;
+ }
}
+#if defined(USE_SSE2) && defined(USE_ASSERT_CHECKING)
+ Assert(assert_result == false);
+#endif
return false;
}
--
2.36.1
From ff77224f9227bcff88a68e63f39754296810351c Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathandboss...@gmail.com>
Date: Wed, 3 Aug 2022 09:59:28 -0700
Subject: [PATCH v11 4/4] Optimize linear searches in XidInMVCCSnapshot().
This change uses the recently introduced optimized linear search routine to
speed up searches through the [sub]xip arrays. This should improve
performance significantly when the arrays are large.
Author: Nathan Bossart
Reviewed by: Andres Freund, John Naylor, Bharath Rupireddy, Masahiko Sawada
Discussion: https://postgr.es/m/20220713170950.GA3116318%40nathanxps13
---
src/backend/utils/time/snapmgr.c | 28 +++++++---------------------
1 file changed, 7 insertions(+), 21 deletions(-)
diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c
index 5bc2a15160..9b504c9745 100644
--- a/src/backend/utils/time/snapmgr.c
+++ b/src/backend/utils/time/snapmgr.c
@@ -56,6 +56,7 @@
#include "datatype/timestamp.h"
#include "lib/pairingheap.h"
#include "miscadmin.h"
+#include "port/pg_lfind.h"
#include "storage/predicate.h"
#include "storage/proc.h"
#include "storage/procarray.h"
@@ -2284,8 +2285,6 @@ RestoreTransactionSnapshot(Snapshot snapshot, void *source_pgproc)
bool
XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
{
- uint32 i;
-
/*
* Make a quick range check to eliminate most XIDs without looking at the
* xip arrays. Note that this is OK even if we convert a subxact XID to
@@ -2317,13 +2316,8 @@ XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
if (!snapshot->suboverflowed)
{
/* we have full data, so search subxip */
- int32 j;
-
- for (j = 0; j < snapshot->subxcnt; j++)
- {
- if (TransactionIdEquals(xid, snapshot->subxip[j]))
- return true;
- }
+ if (pg_lfind32(xid, snapshot->subxip, snapshot->subxcnt))
+ return true;
/* not there, fall through to search xip[] */
}
@@ -2344,16 +2338,11 @@ XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
return false;
}
- for (i = 0; i < snapshot->xcnt; i++)
- {
- if (TransactionIdEquals(xid, snapshot->xip[i]))
- return true;
- }
+ if (pg_lfind32(xid, snapshot->xip, snapshot->xcnt))
+ return true;
}
else
{
- int32 j;
-
/*
* In recovery we store all xids in the subxact array because it is by
* far the bigger array, and we mostly don't know which xids are
@@ -2383,11 +2372,8 @@ XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
* indeterminate xid. We don't know whether it's top level or subxact
* but it doesn't matter. If it's present, the xid is visible.
*/
- for (j = 0; j < snapshot->subxcnt; j++)
- {
- if (TransactionIdEquals(xid, snapshot->subxip[j]))
- return true;
- }
+ if (pg_lfind32(xid, snapshot->subxip, snapshot->subxcnt))
+ return true;
}
return false;
--
2.36.1
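One thing worth calling out in 0004: pg_lfind32() replaces
TransactionIdEquals() with a raw uint32 comparison, which is only equivalent
because TransactionId is currently a typedef for uint32. If we wanted to make
that assumption explicit, a compile-time check along these lines could sit
near the calls in snapmgr.c (hypothetical, not part of the patch):

/*
 * Hypothetical guard, not in 0004: pg_lfind32() compares raw uint32s, so
 * fail the build if TransactionId ever stops being 32 bits wide.
 */
StaticAssertDecl(sizeof(TransactionId) == sizeof(uint32),
				 "pg_lfind32 requires TransactionId to be uint32-sized");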
From 216415cff82aef98c02d9e953f956e8311b454d8 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathandboss...@gmail.com>
Date: Wed, 3 Aug 2022 09:49:04 -0700
Subject: [PATCH v11 1/4] Introduce optimized routine for linear searches
through an array of integers.
If SSE2 is available, this function uses it to speed up the search. Otherwise,
it uses a simple 'for' loop. This is a prerequisite for a follow-up commit
that will use this function to optimize [sub]xip lookups in
XidInMVCCSnapshot(), but it can be used anywhere that might benefit from such
an optimization.
It might be worthwhile to add an ARM-specific code path to this function in the
future.
Author: Nathan Bossart
Reviewed by: Andres Freund, John Naylor, Bharath Rupireddy, Masahiko Sawada
Discussion: https://postgr.es/m/20220713170950.GA3116318%40nathanxps13
---
src/include/port/pg_lfind.h | 69 +++++++++++++++++++++++++++++++++++++
1 file changed, 69 insertions(+)
create mode 100644 src/include/port/pg_lfind.h
diff --git a/src/include/port/pg_lfind.h b/src/include/port/pg_lfind.h
new file mode 100644
index 0000000000..24de544f63
--- /dev/null
+++ b/src/include/port/pg_lfind.h
@@ -0,0 +1,69 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_lfind.h
+ * Optimized linear search routines.
+ *
+ * Copyright (c) 2022, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/include/port/pg_lfind.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PG_LFIND_H
+#define PG_LFIND_H
+
+#include "port/simd.h"
+
+/*
+ * pg_lfind32
+ *
+ * Returns true if there is an element in 'base' that equals 'key'. Otherwise,
+ * returns false.
+ */
+static inline bool
+pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
+{
+ uint32 i = 0;
+
+ /* If possible, use SSE2 intrinsics to speed up the search. */
+#ifdef USE_SSE2
+ const __m128i keys = _mm_set1_epi32(key); /* load 4 copies of key */
+ uint32 iterations = nelem & ~0xF; /* round down to multiple of 16 */
+
+ for (; i < iterations; i += 16)
+ {
+ /* load the next 16 values into __m128i variables */
+ const __m128i vals1 = _mm_loadu_si128((__m128i *) &base[i]);
+ const __m128i vals2 = _mm_loadu_si128((__m128i *) &base[i + 4]);
+ const __m128i vals3 = _mm_loadu_si128((__m128i *) &base[i + 8]);
+ const __m128i vals4 = _mm_loadu_si128((__m128i *) &base[i + 12]);
+
+ /* perform the comparisons */
+ const __m128i result1 = _mm_cmpeq_epi32(keys, vals1);
+ const __m128i result2 = _mm_cmpeq_epi32(keys, vals2);
+ const __m128i result3 = _mm_cmpeq_epi32(keys, vals3);
+ const __m128i result4 = _mm_cmpeq_epi32(keys, vals4);
+
+ /* shrink the results into a single variable */
+ const __m128i tmp1 = _mm_or_si128(result1, result2);
+ const __m128i tmp2 = _mm_or_si128(result3, result4);
+ const __m128i result = _mm_or_si128(tmp1, tmp2);
+
+ /* see if there was a match */
+ if (_mm_movemask_epi8(result) != 0)
+ return true;
+ }
+#endif
+
+ /* Process the remaining elements the slow way. */
+ for (; i < nelem; i++)
+ {
+ if (key == base[i])
+ return true;
+ }
+
+ return false;
+}
+
+#endif /* PG_LFIND_H */
--
2.36.1
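Regarding the ARM note in 0001's commit message, for the archives: a NEON
path would presumably slot into the same #ifdef structure. An untested sketch
(one register per iteration for brevity, so it would still want the
block-of-four treatment; note that vmaxvq_u32 is AArch64-only):

#include <arm_neon.h>
#include <stdbool.h>
#include <stdint.h>

/* untested sketch of a NEON equivalent of pg_lfind32 */
static inline bool
pg_lfind32_neon(uint32_t key, uint32_t *base, uint32_t nelem)
{
	uint32_t	i = 0;
	const uint32x4_t keys = vdupq_n_u32(key);	/* 4 copies of key */

	for (; i < (nelem & ~0x3); i += 4)
	{
		/* lanes become all-ones wherever base[i..i+3] equals key */
		const uint32x4_t cmp = vceqq_u32(keys, vld1q_u32(&base[i]));

		/* the horizontal max is nonzero iff some lane matched */
		if (vmaxvq_u32(cmp) != 0)
			return true;
	}

	/* handle the remaining tail elements */
	for (; i < nelem; i++)
		if (key == base[i])
			return true;
	return false;
}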
From f714fa6fdb860c89b3bb8bc54465a43e24bf9496 Mon Sep 17 00:00:00 2001
From: John Naylor <john.nay...@postgresql.org>
Date: Tue, 9 Aug 2022 12:37:22 +0700
Subject: [PATCH v11 3/4] Use optimized linear search for testing if relations
have a syscache
XXX: not for commit just yet
---
src/backend/utils/cache/syscache.c | 69 +++---------------------------
1 file changed, 5 insertions(+), 64 deletions(-)
diff --git a/src/backend/utils/cache/syscache.c b/src/backend/utils/cache/syscache.c
index 1912b12146..d354161c54 100644
--- a/src/backend/utils/cache/syscache.c
+++ b/src/backend/utils/cache/syscache.c
@@ -76,6 +76,7 @@
#include "catalog/pg_type.h"
#include "catalog/pg_user_mapping.h"
#include "lib/qunique.h"
+#include "port/pg_lfind.h"
#include "utils/catcache.h"
#include "utils/rel.h"
#include "utils/syscache.h"
@@ -1044,16 +1045,14 @@ static CatCache *SysCache[SysCacheSize];
static bool CacheInitialized = false;
-/* Sorted array of OIDs of tables that have caches on them */
+/* Array of OIDs of tables that have caches on them */
static Oid SysCacheRelationOid[SysCacheSize];
static int SysCacheRelationOidSize;
-/* Sorted array of OIDs of tables and indexes used by caches */
+/* Array of OIDs of tables and indexes used by caches */
static Oid SysCacheSupportingRelOid[SysCacheSize * 2];
static int SysCacheSupportingRelOidSize;
-static int oid_compare(const void *a, const void *b);
-
/*
* InitCatalogCache - initialize the caches
@@ -1100,19 +1099,6 @@ InitCatalogCache(void)
Assert(SysCacheRelationOidSize <= lengthof(SysCacheRelationOid));
Assert(SysCacheSupportingRelOidSize <= lengthof(SysCacheSupportingRelOid));
- /* Sort and de-dup OID arrays, so we can use binary search. */
- pg_qsort(SysCacheRelationOid, SysCacheRelationOidSize,
- sizeof(Oid), oid_compare);
- SysCacheRelationOidSize =
- qunique(SysCacheRelationOid, SysCacheRelationOidSize, sizeof(Oid),
- oid_compare);
-
- pg_qsort(SysCacheSupportingRelOid, SysCacheSupportingRelOidSize,
- sizeof(Oid), oid_compare);
- SysCacheSupportingRelOidSize =
- qunique(SysCacheSupportingRelOid, SysCacheSupportingRelOidSize,
- sizeof(Oid), oid_compare);
-
CacheInitialized = true;
}
@@ -1552,22 +1538,7 @@ RelationInvalidatesSnapshotsOnly(Oid relid)
bool
RelationHasSysCache(Oid relid)
{
- int low = 0,
- high = SysCacheRelationOidSize - 1;
-
- while (low <= high)
- {
- int middle = low + (high - low) / 2;
-
- if (SysCacheRelationOid[middle] == relid)
- return true;
- if (SysCacheRelationOid[middle] < relid)
- low = middle + 1;
- else
- high = middle - 1;
- }
-
- return false;
+ return pg_lfind32(relid, SysCacheRelationOid, SysCacheRelationOidSize);
}
/*
@@ -1577,35 +1548,5 @@ RelationHasSysCache(Oid relid)
bool
RelationSupportsSysCache(Oid relid)
{
- int low = 0,
- high = SysCacheSupportingRelOidSize - 1;
-
- while (low <= high)
- {
- int middle = low + (high - low) / 2;
-
- if (SysCacheSupportingRelOid[middle] == relid)
- return true;
- if (SysCacheSupportingRelOid[middle] < relid)
- low = middle + 1;
- else
- high = middle - 1;
- }
-
- return false;
-}
-
-
-/*
- * OID comparator for pg_qsort
- */
-static int
-oid_compare(const void *a, const void *b)
-{
- Oid oa = *((const Oid *) a);
- Oid ob = *((const Oid *) b);
-
- if (oa == ob)
- return 0;
- return (oa > ob) ? 1 : -1;
+ return pg_lfind32(relid, SysCacheSupportingRelOid, SysCacheSupportingRelOidSize);
}
--
2.36.1
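Since I said I haven't tested performance for 0003: the hope is that for
arrays of this size (on the order of a hundred entries) a branch-free linear
scan is competitive with the handful of hard-to-predict branches in a binary
search. A toy measurement I'd start from, reusing lfind32_sse2() from the
harness near the top of this mail (link the two files together; no substitute
for real profiling, and the sizes are made up):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define NOIDS 100				/* roughly the size of SysCacheRelationOid */

/* from the cross-check harness earlier in this mail */
bool		lfind32_sse2(uint32_t key, uint32_t *base, uint32_t nelem);

static bool
bsearch_oid(uint32_t key, uint32_t *base, int nelem)
{
	int			low = 0,
				high = nelem - 1;

	while (low <= high)
	{
		int			middle = low + (high - low) / 2;

		if (base[middle] == key)
			return true;
		if (base[middle] < key)
			low = middle + 1;
		else
			high = middle - 1;
	}
	return false;
}

static long
elapsed_us(struct timespec t0, struct timespec t1)
{
	return (long) (t1.tv_sec - t0.tv_sec) * 1000000 +
		(t1.tv_nsec - t0.tv_nsec) / 1000;
}

int
main(void)
{
	static uint32_t oids[NOIDS];
	struct timespec t0,
				t1;
	volatile bool sink = false;

	/* sorted even values, so odd keys always miss (the worst case) */
	for (int i = 0; i < NOIDS; i++)
		oids[i] = 2 * i;

	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (int r = 0; r < 1000000; r++)
		sink |= bsearch_oid(2 * (r % NOIDS) + 1, oids, NOIDS);
	clock_gettime(CLOCK_MONOTONIC, &t1);
	printf("binary search: %ld us\n", elapsed_us(t0, t1));

	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (int r = 0; r < 1000000; r++)
		sink |= lfind32_sse2(2 * (r % NOIDS) + 1, oids, NOIDS);
	clock_gettime(CLOCK_MONOTONIC, &t1);
	printf("linear (SSE2): %ld us\n", elapsed_us(t0, t1));

	return sink ? 1 : 0;
}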