On 29/06/2021 14:20, John Naylor wrote:
> I still wasn't quite happy with the churn in the regression tests, so
> for v13 I gave up on using both the existing utf8 table and my new one
> for the "padded input" tests, and instead just copied the NUL byte test
> into the new table. Also added a primary key to make sure the padded
> test won't give weird results if a new entry has a duplicate description.
> I came up with "highbit_carry" as a more descriptive variable name than
> "x", but that doesn't matter a whole lot.
>
> It also occurred to me that if we're going to check one 8-byte chunk at
> a time (like v12 does), maybe it's only worth it to load 8 bytes at a
> time. An earlier version did this, but without the recent tweaks. The
> worst-case scenario now might be different from the one with 16 bytes,
> but for now just tested the previous worst case (mixed2).
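
For reference, the 8-byte ASCII check being discussed boils down to the
following. This is a simplified, standalone sketch of the check_ascii()
helper in the attached patch, using <stdint.h> types instead of PG's
uint64:

#include <stdint.h>
#include <string.h>

/*
 * Return 8 if the next 8 bytes at 's' are all non-zero ASCII, else 0.
 * (Standalone sketch of check_ascii() from the attached patch.)
 */
static inline int
check_ascii_chunk(const unsigned char *s)
{
	uint64_t	chunk;

	memcpy(&chunk, s, sizeof(chunk));

	/* Any byte with the high bit set means non-ASCII. */
	if (chunk & UINT64_C(0x8080808080808080))
		return 0;

	/*
	 * Zero-byte check: adding 0x7f to each byte sets the high bit of
	 * every byte except a zero byte. No carry between bytes is possible,
	 * because at this point every byte is at most 0x7f.
	 */
	chunk += UINT64_C(0x7f7f7f7f7f7f7f7f);
	if ((chunk & UINT64_C(0x8080808080808080)) ==
		UINT64_C(0x8080808080808080))
		return 8;
	return 0;
}
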
I tested the new worst-case scenario on my laptop:
gcc master:
chinese | mixed | ascii | mixed16 | mixed8
---------+-------+-------+---------+--------
1311 | 758 | 405 | 583 | 725
gcc v13:
chinese | mixed | ascii | mixed16 | mixed8
---------+-------+-------+---------+--------
956 | 472 | 160 | 572 | 939
mixed16 is the same as "mixed2" in the previous rounds, with
'123456789012345ä' as the repeating string, and mixed8 uses '1234567ä',
which I believe is the worst case for patch v13. So v13 is somewhat
slower than master in the worst case.
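
Presumably mixed8 is the worst case because its repeating pattern is 9
bytes long, so every 8-byte window contains at least one of the two
UTF-8 bytes of 'ä', and the chunked ASCII check can never succeed. A
quick throwaway program to sanity-check that claim (illustrative only,
not part of the patch):

#include <stdio.h>

int
main(void)
{
	/* '1234567ä' in UTF-8: 7 ASCII bytes plus 0xC3 0xA4, 9 bytes total */
	const unsigned char pattern[] = "1234567\xC3\xA4";
	unsigned char buf[1024];
	int			i,
				j;

	for (i = 0; i < (int) sizeof(buf); i++)
		buf[i] = pattern[i % 9];

	/* look for any 8-byte window that is pure ASCII */
	for (i = 0; i + 8 <= (int) sizeof(buf); i++)
	{
		int			ascii_only = 1;

		for (j = 0; j < 8; j++)
			if (buf[i + j] & 0x80)
				ascii_only = 0;
		if (ascii_only)
		{
			printf("all-ASCII window at offset %d\n", i);
			return 0;
		}
	}
	printf("every 8-byte window contains a non-ASCII byte\n");
	return 0;
}

With a 9-byte period and two adjacent high-bit bytes, no 8-byte window
can avoid them both, so the fast path never gets to skip a chunk on the
mixed8 input.
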
Hmm, there's one more simple trick we can do: We can have a separate
fast-path version of the loop when there are at least 8 bytes of input
left, skipping all the length checks. With that:
gcc v14:
chinese | mixed | ascii | mixed16 | mixed8
---------+-------+-------+---------+--------
737 | 412 | 94 | 476 | 725
All the above numbers were with gcc 10.2.1. For completeness, with clang
11.0.1-2 I got:
clang master:
chinese | mixed | ascii | mixed16 | mixed8
---------+-------+-------+---------+--------
1044 | 724 | 403 | 930 | 603
(1 row)
clang v13:
chinese | mixed | ascii | mixed16 | mixed8
---------+-------+-------+---------+--------
596 | 445 | 79 | 417 | 715
(1 row)
clang v14:
chinese | mixed | ascii | mixed16 | mixed8
---------+-------+-------+---------+--------
600 | 337 | 93 | 318 | 511
Attached is patch v14 with that optimization. It needs some cleanup; I
just hacked it up quickly for performance testing.
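
In outline, the verification loop in v14 is roughly the following, a
simplified sketch of pg_utf8_verifystr() in the attached patch, reusing
its check_ascii() and pg_utf8_verify_one() helpers:

static int
pg_utf8_verifystr_sketch(const unsigned char *s, int len)
{
	const unsigned char *start = s;

	/* Fast path: at least 8 bytes left, so no per-character length checks. */
	while (len >= 8)
	{
		int			l = check_ascii(s, 8);

		if (l == 0)
		{
			/* non-ASCII or zero byte in the chunk, verify one character */
			l = pg_utf8_verify_one(s, 8);
			if (l == 0)
				return s - start;	/* invalid input */
		}
		s += l;
		len -= l;
	}

	/* Slow path for the last few (< 8) bytes. */
	while (len > 0)
	{
		int			l = pg_utf8_verify_one(s, len);

		if (l == 0)
			break;
		s += l;
		len -= l;
	}

	return s - start;
}
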
- Heikki
From 731b61ede9fb4c25b1bedeaafb065015704faafd Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakan...@iki.fi>
Date: Wed, 30 Jun 2021 14:15:55 +0300
Subject: [PATCH v14 1/1] Rewrite pg_utf8_verifystr() for speed
Based on John Naylor's v13 patch here:
https://www.postgresql.org/message-id/CAFBsxsH9xJpru2U6_ua963LV8LP34%3DbJRaESUTUS1mH6Y-m%2B_g%40mail.gmail.com
I added a separate fast-path version of the loop for when there are at least
8 bytes of input left.
---
src/common/wchar.c | 187 +++++++++++++++++++++--
src/test/regress/expected/conversion.out | 85 +++++++++++
src/test/regress/sql/conversion.sql | 57 +++++++
3 files changed, 318 insertions(+), 11 deletions(-)
diff --git a/src/common/wchar.c b/src/common/wchar.c
index 0636b8765ba..3ccef6c3cbe 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -15,6 +15,51 @@
#include "mb/pg_wchar.h"
+/* for UTF-8 */
+#define IS_CONTINUATION_BYTE(c) (((c) & 0xC0) == 0x80)
+#define IS_TWO_BYTE_LEAD(c) (((c) & 0xE0) == 0xC0)
+#define IS_THREE_BYTE_LEAD(c) (((c) & 0xF0) == 0xE0)
+#define IS_FOUR_BYTE_LEAD(c) (((c) & 0xF8) == 0xF0)
+
+/* Verify a chunk of bytes for valid ASCII including a zero-byte check. */
+static inline int
+check_ascii(const unsigned char *s, int len)
+{
+ uint64 chunk,
+ highbits_set,
+ highbit_carry;
+
+ if (len >= sizeof(uint64))
+ {
+ memcpy(&chunk, s, sizeof(uint64));
+
+ /* Check if any bytes in this chunk have the high bit set. */
+ highbits_set = chunk & UINT64CONST(0x8080808080808080);
+ if (highbits_set)
+ return 0;
+
+ /*
+ * Check if there are any zero bytes in this chunk.
+ *
+ * First, add 0x7f to each byte. This sets the high bit in each byte,
+ * unless it was a zero. We already checked that none of the bytes had
+ * the high bit set previously, so the max value each byte can have
+ * after the addition is 0x7f + 0x7f = 0xfe, and we don't need to
+ * worry about carrying over to the next byte.
+ */
+ highbit_carry = chunk + UINT64CONST(0x7f7f7f7f7f7f7f7f);
+
+ /* Then check that the high bit is set in each byte. */
+ highbit_carry &= UINT64CONST(0x8080808080808080);
+ if (highbit_carry == UINT64CONST(0x8080808080808080))
+ return sizeof(uint64);
+ else
+ return 0;
+ }
+ else
+ return 0;
+}
+
/*
* Operations on multi-byte encodings are driven by a table of helper
* functions.
@@ -1757,32 +1802,152 @@ pg_utf8_verifychar(const unsigned char *s, int len)
return l;
}
+/*
+ * Subroutine of pg_utf8_verifystr() to check one character. Returns the length of the
+ * character at *s in bytes, or 0 on invalid input or premature end of input.
+ *
+ * XXX: could this be combined with pg_utf8_verifychar above?
+ */
+static inline int
+pg_utf8_verify_one(const unsigned char *s, int len)
+{
+ int l;
+ unsigned char b1,
+ b2,
+ b3,
+ b4;
+
+ /* plain ASCII byte, but reject an embedded zero byte */
+ if (!IS_HIGHBIT_SET(*s))
+ {
+ if (*s == '\0')
+ return 0;
+ l = 1;
+ }
+ /* code points U+0080 through U+07FF */
+ else if (IS_TWO_BYTE_LEAD(*s))
+ {
+ l = 2;
+ if (len < l)
+ return 0;
+
+ b1 = *s;
+ b2 = *(s + 1);
+
+ if (!IS_CONTINUATION_BYTE(b2))
+ return 0;
+
+ /* check 2-byte overlong: 1100.000x.10xx.xxxx */
+ if (b1 < 0xC2)
+ return 0;
+ }
+ /* code points U+0800 through U+D7FF and U+E000 through U+FFFF */
+ else if (IS_THREE_BYTE_LEAD(*s))
+ {
+ l = 3;
+ if (len < l)
+ return 0;
+
+ b1 = *s;
+ b2 = *(s + 1);
+ b3 = *(s + 2);
+
+ if (!IS_CONTINUATION_BYTE(b2) ||
+ !IS_CONTINUATION_BYTE(b3))
+ return 0;
+
+ /* check 3-byte overlong: 1110.0000 1001.xxxx 10xx.xxxx */
+ if (b1 == 0xE0 && b2 < 0xA0)
+ return 0;
+
+ /* check surrogate: 1110.1101 101x.xxxx 10xx.xxxx */
+ if (b1 == 0xED && b2 > 0x9F)
+ return 0;
+ }
+ /* code points U+010000 through U+10FFFF */
+ else if (IS_FOUR_BYTE_LEAD(*s))
+ {
+ l = 4;
+ if (len < l)
+ return 0;
+
+ b1 = *s;
+ b2 = *(s + 1);
+ b3 = *(s + 2);
+ b4 = *(s + 3);
+
+ if (!IS_CONTINUATION_BYTE(b2) ||
+ !IS_CONTINUATION_BYTE(b3) ||
+ !IS_CONTINUATION_BYTE(b4))
+ return 0;
+
+ /*
+ * check 4-byte overlong: 1111.0000 1000.xxxx 10xx.xxxx 10xx.xxxx
+ */
+ if (b1 == 0xF0 && b2 < 0x90)
+ return 0;
+
+ /* check too large: 1111.0100 1001.xxxx 10xx.xxxx 10xx.xxxx */
+ if ((b1 == 0xF4 && b2 > 0x8F) || b1 > 0xF4)
+ return 0;
+ }
+ else
+ /* invalid byte */
+ return 0;
+
+ return l;
+}
+
+
static int
pg_utf8_verifystr(const unsigned char *s, int len)
{
const unsigned char *start = s;
- while (len > 0)
+ /*
+ * Fast path when we have at least 8 bytes left in the string. We can
+ * skip the length checks in the loop.
+ */
+ while (len >= 8)
{
int l;
/* fast path for ASCII-subset characters */
- if (!IS_HIGHBIT_SET(*s))
- {
- if (*s == '\0')
- break;
- l = 1;
- }
- else
+ l = check_ascii(s, 8);
+ if (l)
{
- l = pg_utf8_verifychar(s, len);
- if (l == -1)
- break;
+ s += l;
+ len -= l;
+ continue;
}
+
+ /*
+ * Found a non-ASCII or zero byte above, so verify a single character.
+ * By passing the length as a constant, the compiler should optimize
+ * away the length checks in pg_utf8_verify_one.
+ */
+ l = pg_utf8_verify_one(s, 8);
+ if (l == 0)
+ goto end;
+
+ s += l;
+ len -= l;
+ }
+
+ /* Slow path to handle the last few (< 8) bytes in the string */
+ while (len > 0)
+ {
+ int l;
+
+ l = pg_utf8_verify_one(s, len);
+ if (l == 0)
+ goto end;
+
s += l;
len -= l;
}
+end:
return s - start;
}
diff --git a/src/test/regress/expected/conversion.out b/src/test/regress/expected/conversion.out
index 04fdcba4964..92b5df62c8e 100644
--- a/src/test/regress/expected/conversion.out
+++ b/src/test/regress/expected/conversion.out
@@ -72,6 +72,91 @@ $$;
--
-- UTF-8
--
+-- The description column must be unique.
+CREATE TABLE utf8_verification_inputs (inbytes bytea, description text PRIMARY KEY);
+insert into utf8_verification_inputs values
+ ('\xaf', 'bare continuation'),
+ ('\xc5', 'missing second byte in 2-byte char'),
+ ('\xc080', 'smallest 2-byte overlong'),
+ ('\xc1bf', 'largest 2-byte overlong'),
+ ('\xc280', 'next 2-byte after overlongs'),
+ ('\xdfbf', 'largest 2-byte'),
+ ('\xe9af', 'missing third byte in 3-byte char'),
+ ('\xe08080', 'smallest 3-byte overlong'),
+ ('\xe09fbf', 'largest 3-byte overlong'),
+ ('\xe0a080', 'next 3-byte after overlong'),
+ ('\xed9fbf', 'last before surrogates'),
+ ('\xeda080', 'smallest surrogate'),
+ ('\xedbfbf', 'largest surrogate'),
+ ('\xee8080', 'next after surrogates'),
+ ('\xefbfbf', 'largest 3-byte'),
+ ('\xf1afbf', 'missing fourth byte in 4-byte char'),
+ ('\xf0808080', 'smallest 4-byte overlong'),
+ ('\xf08fbfbf', 'largest 4-byte overlong'),
+ ('\xf0908080', 'next 4-byte after overlong'),
+ ('\xf48fbfbf', 'largest 4-byte'),
+ ('\xf4908080', 'smallest too large'),
+ ('\xfa9a9a8a8a', '5-byte'),
+ ('\x66006f', 'NUL byte');
+-- Test UTF-8 verification
+select description, (test_conv(inbytes, 'utf8', 'utf8')).* from utf8_verification_inputs;
+ description | result | errorat | error
+------------------------------------+------------+--------------+----------------------------------------------------------------
+ bare continuation | \x | \xaf | invalid byte sequence for encoding "UTF8": 0xaf
+ missing second byte in 2-byte char | \x | \xc5 | invalid byte sequence for encoding "UTF8": 0xc5
+ smallest 2-byte overlong | \x | \xc080 | invalid byte sequence for encoding "UTF8": 0xc0 0x80
+ largest 2-byte overlong | \x | \xc1bf | invalid byte sequence for encoding "UTF8": 0xc1 0xbf
+ next 2-byte after overlongs | \xc280 | |
+ largest 2-byte | \xdfbf | |
+ missing third byte in 3-byte char | \x | \xe9af | invalid byte sequence for encoding "UTF8": 0xe9 0xaf
+ smallest 3-byte overlong | \x | \xe08080 | invalid byte sequence for encoding "UTF8": 0xe0 0x80 0x80
+ largest 3-byte overlong | \x | \xe09fbf | invalid byte sequence for encoding "UTF8": 0xe0 0x9f 0xbf
+ next 3-byte after overlong | \xe0a080 | |
+ last before surrogates | \xed9fbf | |
+ smallest surrogate | \x | \xeda080 | invalid byte sequence for encoding "UTF8": 0xed 0xa0 0x80
+ largest surrogate | \x | \xedbfbf | invalid byte sequence for encoding "UTF8": 0xed 0xbf 0xbf
+ next after surrogates | \xee8080 | |
+ largest 3-byte | \xefbfbf | |
+ missing fourth byte in 4-byte char | \x | \xf1afbf | invalid byte sequence for encoding "UTF8": 0xf1 0xaf 0xbf
+ smallest 4-byte overlong | \x | \xf0808080 | invalid byte sequence for encoding "UTF8": 0xf0 0x80 0x80 0x80
+ largest 4-byte overlong | \x | \xf08fbfbf | invalid byte sequence for encoding "UTF8": 0xf0 0x8f 0xbf 0xbf
+ next 4-byte after overlong | \xf0908080 | |
+ largest 4-byte | \xf48fbfbf | |
+ smallest too large | \x | \xf4908080 | invalid byte sequence for encoding "UTF8": 0xf4 0x90 0x80 0x80
+ 5-byte | \x | \xfa9a9a8a8a | invalid byte sequence for encoding "UTF8": 0xfa
+ NUL byte | \x66 | \x006f | invalid byte sequence for encoding "UTF8": 0x00
+(23 rows)
+
+-- Test UTF-8 verification with ASCII padding appended to provide
+-- coverage for algorithms that work on multiple bytes at a time.
+with test_bytes as (
+ -- The error message for a sequence starting with a 4-byte lead
+ -- will contain all 4 bytes if they are present, so add 3
+ -- ASCII bytes to the end to ensure consistent error messages.
+ select
+ inbytes,
+ description,
+ (test_conv(inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error
+ from utf8_verification_inputs
+), test_padded as (
+ select
+ description,
+ (test_conv(inbytes || repeat('.', 16)::bytea, 'utf8', 'utf8')).error
+ from test_bytes
+)
+select
+ description,
+ b.error as orig_error,
+ p.error as error_after_padding
+from test_padded p
+join test_bytes b
+using (description)
+where p.error is distinct from b.error
+order by description;
+ description | orig_error | error_after_padding
+-------------+------------+---------------------
+(0 rows)
+
CREATE TABLE utf8_inputs (inbytes bytea, description text);
insert into utf8_inputs values
('\x666f6f', 'valid, pure ASCII'),
diff --git a/src/test/regress/sql/conversion.sql b/src/test/regress/sql/conversion.sql
index 83586824321..a3e12961db8 100644
--- a/src/test/regress/sql/conversion.sql
+++ b/src/test/regress/sql/conversion.sql
@@ -74,6 +74,63 @@ $$;
--
-- UTF-8
--
+-- The description column must be unique.
+CREATE TABLE utf8_verification_inputs (inbytes bytea, description text PRIMARY KEY);
+insert into utf8_verification_inputs values
+ ('\xaf', 'bare continuation'),
+ ('\xc5', 'missing second byte in 2-byte char'),
+ ('\xc080', 'smallest 2-byte overlong'),
+ ('\xc1bf', 'largest 2-byte overlong'),
+ ('\xc280', 'next 2-byte after overlongs'),
+ ('\xdfbf', 'largest 2-byte'),
+ ('\xe9af', 'missing third byte in 3-byte char'),
+ ('\xe08080', 'smallest 3-byte overlong'),
+ ('\xe09fbf', 'largest 3-byte overlong'),
+ ('\xe0a080', 'next 3-byte after overlong'),
+ ('\xed9fbf', 'last before surrogates'),
+ ('\xeda080', 'smallest surrogate'),
+ ('\xedbfbf', 'largest surrogate'),
+ ('\xee8080', 'next after surrogates'),
+ ('\xefbfbf', 'largest 3-byte'),
+ ('\xf1afbf', 'missing fourth byte in 4-byte char'),
+ ('\xf0808080', 'smallest 4-byte overlong'),
+ ('\xf08fbfbf', 'largest 4-byte overlong'),
+ ('\xf0908080', 'next 4-byte after overlong'),
+ ('\xf48fbfbf', 'largest 4-byte'),
+ ('\xf4908080', 'smallest too large'),
+ ('\xfa9a9a8a8a', '5-byte'),
+ ('\x66006f', 'NUL byte');
+
+-- Test UTF-8 verification
+select description, (test_conv(inbytes, 'utf8', 'utf8')).* from utf8_verification_inputs;
+
+-- Test UTF-8 verification with ASCII padding appended to provide
+-- coverage for algorithms that work on multiple bytes at a time.
+with test_bytes as (
+ -- The error message for a sequence starting with a 4-byte lead
+ -- will contain all 4 bytes if they are present, so add 3
+ -- ASCII bytes to the end to ensure consistent error messages.
+ select
+ inbytes,
+ description,
+ (test_conv(inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error
+ from utf8_verification_inputs
+), test_padded as (
+ select
+ description,
+ (test_conv(inbytes || repeat('.', 16)::bytea, 'utf8', 'utf8')).error
+ from test_bytes
+)
+select
+ description,
+ b.error as orig_error,
+ p.error as error_after_padding
+from test_padded p
+join test_bytes b
+using (description)
+where p.error is distinct from b.error
+order by description;
+
CREATE TABLE utf8_inputs (inbytes bytea, description text);
insert into utf8_inputs values
('\x666f6f', 'valid, pure ASCII'),
--
2.30.2