I wrote:
> We can also shave a
> few percent by having pg_utf8_verifystr use SSE2 for the ascii path. I
> can look into this.
Here's a patch for that. If the input is mostly ascii, I'd expect that
part of the flame graph to shrink by 40-50% and give a small boost
overall.
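
For anyone who wants to poke at the idea outside the tree, below is a
minimal standalone sketch of the same SSE2 check (the file name and the
helper name are mine, not part of the patch). It ORs each 16-byte chunk
into an accumulator, ORs in a compare-against-zero mask so embedded NUL
bytes also trip a lane's high bit, and finally inspects the high bits
with _mm_movemask_epi8. On x86-64, plain "gcc -O2 sketch.c" is enough,
since SSE2 is baseline there:

/* sketch.c -- hypothetical demo mirroring the patch's SSE2 path */
#include <emmintrin.h>		/* SSE2 intrinsics */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/*
 * Returns false if the input contains any zero bytes or bytes with the
 * high bit set.  len must be a multiple of 16.
 */
static bool
chunk_is_ascii(const unsigned char *s, int len)
{
	__m128i		error_cum = _mm_setzero_si128();

	while (len > 0)
	{
		__m128i		chunk = _mm_loadu_si128((const __m128i *) s);

		/* any high bit in the input sets that lane's high bit here */
		error_cum = _mm_or_si128(error_cum, chunk);

		/* zero bytes compare equal, yielding 0xFF in that lane */
		error_cum = _mm_or_si128(error_cum,
								 _mm_cmpeq_epi8(chunk, _mm_setzero_si128()));

		s += sizeof(chunk);
		len -= sizeof(chunk);
	}

	/* gather the high bit of each lane; zero means all ASCII, no NULs */
	return _mm_movemask_epi8(error_cum) == 0;
}

int
main(void)
{
	unsigned char buf[32];

	memset(buf, 'a', sizeof(buf));
	printf("ascii:    %d\n", chunk_is_ascii(buf, sizeof(buf)));	/* 1 */
	buf[20] = 0xC3;			/* lead byte of a 2-byte UTF-8 sequence */
	printf("highbit:  %d\n", chunk_is_ascii(buf, sizeof(buf)));	/* 0 */
	buf[20] = '\0';
	printf("NUL byte: %d\n", chunk_is_ascii(buf, sizeof(buf)));	/* 0 */
	return 0;
}

As far as I can tell, the accumulate-then-test shape keeps the loop
branch-free, which is what lets the compiler unroll the two-chunk stride
used in the patch.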
--
John Naylor
EDB: http://www.enterprisedb.com
src/common/wchar.c | 18 ++++--------
src/include/mb/pg_wchar.h | 50 ++++++++++++++++++++++++++++++--
src/test/regress/expected/conversion.out | 3 +-
src/test/regress/sql/conversion.sql | 3 +-
4 files changed, 58 insertions(+), 16 deletions(-)
diff --git a/src/common/wchar.c b/src/common/wchar.c
index 1e6e198bf2..a305e0e66b 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -1918,26 +1918,20 @@ pg_utf8_verifystr(const unsigned char *s, int len)
const int orig_len = len;
uint32 state = BGN;
-/*
- * Sixteen seems to give the best balance of performance across different
- * byte distributions.
- */
-#define STRIDE_LENGTH 16
-
- if (len >= STRIDE_LENGTH)
+ if (len >= ASCII_CHECK_LEN)
{
- while (len >= STRIDE_LENGTH)
+ while (len >= ASCII_CHECK_LEN)
{
/*
* If the chunk is all ASCII, we can skip the full UTF-8 check,
* but we must first check for a non-END state, which means the
* previous chunk ended in the middle of a multibyte sequence.
*/
- if (state != END || !is_valid_ascii(s, STRIDE_LENGTH))
- utf8_advance(s, &state, STRIDE_LENGTH);
+ if (state != END || !is_valid_ascii(s, ASCII_CHECK_LEN))
+ utf8_advance(s, &state, ASCII_CHECK_LEN);
- s += STRIDE_LENGTH;
- len -= STRIDE_LENGTH;
+ s += ASCII_CHECK_LEN;
+ len -= ASCII_CHECK_LEN;
}
/* The error state persists, so we only need to check for it here. */
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index 31f5b393da..ca238c212b 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -700,19 +700,64 @@ extern WCHAR *pgwin32_message_to_UTF16(const char *str, int len, int *utf16len);
#endif
+/*
+ * Note: We piggy-back on the configure check for SSE 4.2 intrinsics, but we
+ * only need SSE2 at runtime. That's supported by all x86-64 hardware, so we
+ * don't need an indirect function call.
+ * WIP: put this somewhere useful
+ */
+#if (defined (__x86_64__) || defined(_M_AMD64)) && (defined(USE_SSE42_CRC32C) || defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK))
+#include <nmmintrin.h>
+#define USE_SSE2
+#endif
+
/*
* Verify a chunk of bytes for valid ASCII.
*
* Returns false if the input contains any zero bytes or bytes with the
- * high-bit set. Input len must be a multiple of 8.
+ * high-bit set. Input len must be a multiple of the chunk size (8 or 16).
*/
static inline bool
is_valid_ascii(const unsigned char *s, int len)
{
+#ifdef USE_SSE2
+ __m128i chunk,
+ error_cum = _mm_setzero_si128(),
+ zeros;
+
+/*
+ * With two chunks, gcc can unroll the loop, so provide a convenience macro
+ * for callers. Even if the compiler can unroll a longer loop, it's not
+ * worth it because callers might have to use a byte-wise algorithm if we
+ * return false.
+ */
+#define ASCII_CHECK_LEN (2 * sizeof(__m128i))
+ Assert(len % sizeof(chunk) == 0);
+
+ while (len > 0)
+ {
+ chunk = _mm_loadu_si128((const __m128i *) s);
+
+ /* Capture all set bits in this chunk. */
+ error_cum = _mm_or_si128(error_cum, chunk);
+
+ /*
+  * Set all bits in each lane where the input byte is zero, and fold
+  * them into the error accumulator.
+  */
+ zeros = _mm_cmpeq_epi8(chunk, _mm_setzero_si128());
+ error_cum = _mm_or_si128(error_cum, zeros);
+
+ s += sizeof(chunk);
+ len -= sizeof(chunk);
+ }
+
+ /* Check if any high bits in the error accumulator got set. */
+ return _mm_movemask_epi8(error_cum) == 0;
+
+#else
uint64 chunk,
highbit_cum = UINT64CONST(0),
zero_cum = UINT64CONST(0x8080808080808080);
+#define ASCII_CHECK_LEN (2 * sizeof(uint64))
Assert(len % sizeof(chunk) == 0);
while (len > 0)
@@ -734,7 +779,7 @@ is_valid_ascii(const unsigned char *s, int len)
*/
zero_cum &= (chunk + UINT64CONST(0x7f7f7f7f7f7f7f7f));
- /* Capture any set bits in this chunk. */
+ /* Capture all set bits in this chunk. */
highbit_cum |= chunk;
s += sizeof(chunk);
@@ -750,6 +795,7 @@ is_valid_ascii(const unsigned char *s, int len)
return false;
return true;
+#endif /* USE_SSE2 */
}
#endif /* PG_WCHAR_H */
diff --git a/src/test/regress/expected/conversion.out b/src/test/regress/expected/conversion.out
index 442e7aff2b..434dc4d93c 100644
--- a/src/test/regress/expected/conversion.out
+++ b/src/test/regress/expected/conversion.out
@@ -140,7 +140,8 @@ select description, (test_conv(inbytes, 'utf8', 'utf8')).* from utf8_verificatio
-- will contain all 4 bytes if they are present, so various
-- expressions below add 3 ASCII bytes to the end to ensure
-- consistent error messages.
--- The number 64 below needs to be at least the value of STRIDE_LENGTH in wchar.c.
+-- The number 64 below needs to be equal to, or a multiple of, the largest
+-- possible value of ASCII_CHECK_LEN in mb/pg_wchar.h.
-- Test multibyte verification in fast path
with test_bytes as (
select
diff --git a/src/test/regress/sql/conversion.sql b/src/test/regress/sql/conversion.sql
index 9a65fca91f..27ef069eaf 100644
--- a/src/test/regress/sql/conversion.sql
+++ b/src/test/regress/sql/conversion.sql
@@ -121,7 +121,8 @@ select description, (test_conv(inbytes, 'utf8', 'utf8')).* from utf8_verificatio
-- will contain all 4 bytes if they are present, so various
-- expressions below add 3 ASCII bytes to the end to ensure
-- consistent error messages.
--- The number 64 below needs to be at least the value of STRIDE_LENGTH in wchar.c.
+-- The number 64 below needs to be equal to, or a multiple of, the largest
+-- possible value of ASCII_CHECK_LEN in mb/pg_wchar.h.
-- Test multibyte verification in fast path
with test_bytes as (