v3 applies on top of the v9 json_lex_string patch in [1] and builds on it
a little further, resulting in a simpler patch that is more amenable to
supporting additional SIMD-capable platforms.
[1]
https://www.postgresql.org/message-id/CAFBsxsFV4v802idV0-Bo%3DV7wLMHRbOZ4er0hgposhyGCikmVGA%40mail.gmail.com
--
John Naylor
EDB: http://www.enterprisedb.com
diff --git a/src/common/wchar.c b/src/common/wchar.c
index 1e6e198bf2..1ca7533f00 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -1918,11 +1918,12 @@ pg_utf8_verifystr(const unsigned char *s, int len)
const int orig_len = len;
uint32 state = BGN;
-/*
- * Sixteen seems to give the best balance of performance across different
- * byte distributions.
- */
-#define STRIDE_LENGTH 16
+ /*
+ * With a stride of two vector widths, gcc will unroll the loop. Even if
+ * the compiler can unroll a longer loop, it's not worth it because we
+ * must fall back to the byte-wise algorithm if we find any non-ASCII.
+ */
+#define STRIDE_LENGTH (2 * sizeof(Vector8))
if (len >= STRIDE_LENGTH)
{
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index 011b0b3abd..aea045aa66 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -19,6 +19,8 @@
#ifndef PG_WCHAR_H
#define PG_WCHAR_H
+#include "port/simd.h"
+
/*
* The pg_wchar type
*/
@@ -704,25 +706,28 @@ extern WCHAR *pgwin32_message_to_UTF16(const char *str, int len, int *utf16len);
* Verify a chunk of bytes for valid ASCII.
*
* Returns false if the input contains any zero bytes or bytes with the
- * high-bit set. Input len must be a multiple of 8.
+ * high-bit set. Input len must be a multiple of the chunk size (8 or 16).
*/
static inline bool
is_valid_ascii(const unsigned char *s, int len)
{
const unsigned char *const s_end = s + len;
- uint64 chunk,
- highbit_cum = UINT64CONST(0),
- zero_cum = UINT64CONST(0x8080808080808080);
+ Vector8 chunk;
+ Vector8 highbit_cum = vector8_broadcast(0);
+#ifdef USE_NO_SIMD
+ Vector8 zero_cum = vector8_broadcast(0x80);
+#endif
Assert(len % sizeof(chunk) == 0);
while (s < s_end)
{
- memcpy(&chunk, s, sizeof(chunk));
+ vector8_load(&chunk, s);
+
+ /* Capture any zero bytes in this chunk. */
+#if defined(USE_NO_SIMD)
/*
- * Capture any zero bytes in this chunk.
- *
* First, add 0x7f to each byte. This sets the high bit in each byte,
* unless it was a zero. If any resulting high bits are zero, the
* corresponding high bits in the zero accumulator will be cleared.
@@ -734,20 +739,31 @@ is_valid_ascii(const unsigned char *s, int len)
* because we check for those separately.
*/
zero_cum &= (chunk + UINT64CONST(0x7f7f7f7f7f7f7f7f));
+#else
+
+ /*
+ * Set all bits in each lane of the highbit accumulator where input
+ * bytes are zero.
+ */
+ highbit_cum = vector8_or(highbit_cum,
+ vector8_eq(chunk, vector8_broadcast(0)));
+#endif
/* Capture all set bits in this chunk. */
- highbit_cum |= chunk;
+ highbit_cum = vector8_or(highbit_cum, chunk);
s += sizeof(chunk);
}
/* Check if any high bits in the high bit accumulator got set. */
- if (highbit_cum & UINT64CONST(0x8080808080808080))
+ if (vector8_is_highbit_set(highbit_cum))
return false;
+#ifdef USE_NO_SIMD
/* Check if any high bits in the zero accumulator got cleared. */
- if (zero_cum != UINT64CONST(0x8080808080808080))
+ if (zero_cum != vector8_broadcast(0x80))
return false;
+#endif
return true;
}
diff --git a/src/include/port/simd.h b/src/include/port/simd.h
index 56df989094..8f85153110 100644
--- a/src/include/port/simd.h
+++ b/src/include/port/simd.h
@@ -38,6 +38,7 @@ typedef __m128i Vector8;
* If no SIMD instructions are available, we can in some cases emulate vector
* operations using bitwise operations on unsigned integers.
*/
+#define USE_NO_SIMD
typedef uint64 Vector8;
#endif
@@ -47,7 +48,11 @@ static inline Vector8 vector8_broadcast(const uint8 c);
static inline bool vector8_has_zero(const Vector8 v);
static inline bool vector8_has(const Vector8 v, const uint8 c);
static inline bool vector8_has_le(const Vector8 v, const uint8 c);
-
+static inline bool vector8_is_highbit_set(const Vector8 v);
+static inline Vector8 vector8_or(const Vector8 v1, const Vector8 v2);
+#ifndef USE_NO_SIMD
+static inline Vector8 vector8_eq(const Vector8 v1, const Vector8 v2);
+#endif
/*
* Functions for loading a chunk of memory into a vector.
@@ -181,4 +186,38 @@ vector8_has_le(const Vector8 v, const uint8 c)
return result;
}
+/*
+ * Return true if the high bit (0x80) is set in any byte of the vector.
+ *
+ * The result is always reduced to a zero/non-zero comparison rather than
+ * returning the raw mask.  In the integer fallback the mask is 64 bits wide,
+ * and if "bool" is a typedef for unsigned char instead of C99 _Bool, an
+ * implicit conversion would keep only the lowest byte and miss high bits
+ * found in any of the other seven bytes.
+ */
+static inline bool
+vector8_is_highbit_set(const Vector8 v)
+{
+#ifdef USE_SSE2
+	/* movemask collects the most significant bit of every byte lane */
+	return _mm_movemask_epi8(v) != 0;
+#else
+	/* keep only the high bit of each byte, then test for any survivor */
+	return (v & vector8_broadcast(0x80)) != 0;
+#endif
+}
+
+/* comparisons between vectors */
+
+#ifndef USE_NO_SIMD
+/*
+ * Compare two vectors byte by byte and return the comparison mask.
+ *
+ * With SSE2 this is _mm_cmpeq_epi8: every bit of a lane is set where the
+ * corresponding bytes of v1 and v2 are equal, and cleared where they
+ * differ.
+ *
+ * This function is only compiled when a real SIMD implementation is in use
+ * (USE_NO_SIMD is not defined); no integer emulation is provided here.  If
+ * another SIMD instruction set is added without a matching branch below,
+ * fail at compile time instead of silently building a non-void function
+ * that falls off its end.
+ */
+static inline Vector8
+vector8_eq(const Vector8 v1, const Vector8 v2)
+{
+#ifdef USE_SSE2
+	return _mm_cmpeq_epi8(v1, v2);
+#else
+#error "vector8_eq() has no implementation for this SIMD instruction set"
+#endif
+}
+#endif
+
+/* bitwise operations */
+
+/*
+ * Return the bitwise OR of the two input vectors.
+ *
+ * Uses the SSE2 128-bit OR when available; otherwise the vector is a plain
+ * unsigned integer and the ordinary | operator does the job.
+ */
+static inline Vector8
+vector8_or(const Vector8 v1, const Vector8 v2)
+{
+	Vector8		combined;
+
+#ifdef USE_SSE2
+	combined = _mm_or_si128(v1, v2);
+#else
+	combined = v1 | v2;
+#endif
+
+	return combined;
+}
+
+
#endif /* SIMD_H */