I wrote:
> To simplify the constants, I do shift down to uint32, and I didn't bother
> working around that. v16alpha regressed on worst-case input, so for v16beta
> I went back to earlier coding for the one-byte ascii check. That helped,
> but it's still slower than v14.
It occurred to me that I could rewrite the switch test into simple
comparisons, like I already had for the 2- and 4-byte lead cases. While I
was at it, I folded the leading byte and continuation tests into a single
operation, like this:
/* 3-byte lead with two continuation bytes */
else if ((chunk & 0xF0C0C00000000000) == 0xE080800000000000)
...and also tried using 64-bit constants to avoid shifting. Still didn't
quite beat v14, but got pretty close:
> The numbers on Power8 / gcc 4.8 (little endian):
>
> HEAD:
>
> chinese | mixed | ascii | mixed16 | mixed8
> ---------+-------+-------+---------+--------
> 2951 | 1521 | 871 | 1474 | 1508
>
> v14:
>
> chinese | mixed | ascii | mixed16 | mixed8
> ---------+-------+-------+---------+--------
> 885 | 607 | 179 | 774 | 1325
v16gamma:
chinese | mixed | ascii | mixed16 | mixed8
---------+-------+-------+---------+--------
952 | 632 | 180 | 800 | 1333
A big-endian 64-bit platform just might shave enough cycles to beat v14
this way... or not.
--
John Naylor
EDB: http://www.enterprisedb.com
diff --git a/src/common/wchar.c b/src/common/wchar.c
index 0636b8765b..f48d79638c 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -13,8 +13,41 @@
#include "c.h"
#include "mb/pg_wchar.h"
+#include "port/pg_bswap.h"
+/*
+ * Verify a chunk of bytes for valid ASCII including a zero-byte check.
+ *
+ * Returns sizeof(chunk) if every byte in the chunk is non-zero and has its
+ * high bit clear, and 0 otherwise. The return value can therefore be used
+ * directly as the number of bytes to advance.
+ */
+static inline int
+check_ascii(const uint64 chunk)
+{
+ uint64
+ highbits_set,
+ highbit_carry;
+
+ /* Check if any bytes in this chunk have the high bit set. */
+ highbits_set = chunk & UINT64CONST(0x8080808080808080);
+ if (highbits_set)
+ return 0;
+
+ /*
+ * Check if there are any zero bytes in this chunk.
+ *
+ * First, add 0x7f to each byte. This sets the high bit in each byte,
+ * unless it was a zero. We already checked that none of the bytes had
+ * the high bit set previously, so the max value each byte can have
+ * after the addition is 0x7f + 0x7f = 0xfe, and we don't need to worry
+ * about carrying over to the next byte.
+ */
+ highbit_carry = chunk + UINT64CONST(0x7f7f7f7f7f7f7f7f);
+
+ /* Then check that the high bit is set in each byte. */
+ highbit_carry &= UINT64CONST(0x8080808080808080);
+ if (highbit_carry == UINT64CONST(0x8080808080808080))
+ return sizeof(chunk);
+ else
+ return 0;
+}
+
/*
* Operations on multi-byte encodings are driven by a table of helper
* functions.
@@ -1728,6 +1761,67 @@ pg_gb18030_verifystr(const unsigned char *s, int len)
return s - start;
}
+/*
+ * Verify the multibyte UTF-8 character at the start of a 64-bit chunk.
+ *
+ * chunk_orig is passed through pg_hton64() so that the comparisons below
+ * can treat the most significant byte as the lead byte, followed by its
+ * continuation bytes. Bytes beyond the character being checked are
+ * masked out of the pattern tests.
+ *
+ * Returns the length of the character in bytes (2, 3 or 4), or -1 on
+ * invalid input. The lead byte must have its high bit set; this is
+ * asserted below. No length argument is taken: the caller in
+ * pg_utf8_verifystr() only builds a chunk while at least sizeof(chunk)
+ * bytes remain in the string.
+ * Static inline for the benefit of pg_utf8_verifystr().
+ */
+static inline int
+pg_utf8_verifychar_internal(const uint64 chunk_orig)
+{
+ const uint64 chunk = (pg_hton64(chunk_orig));
+
+ /* high bit should be set */
+ Assert((chunk & 0x8000000000000000) != 0);
+
+ /* 2-byte lead with one continuation byte */
+ if ((chunk & 0xE0C0000000000000) == 0xC080000000000000)
+ {
+ /*
+ * check 2-byte overlong: 1100.000x.10xx.xxxx
+ * (that is, lead bytes 0xC0 and 0xC1)
+ */
+ if (chunk < 0xC200000000000000)
+ return -1;
+
+ /* found valid sequence for code points U+0080 through U+07FF */
+ return 2;
+ }
+ /* 3-byte lead with two continuation bytes */
+ else if ((chunk & 0xF0C0C00000000000) == 0xE080800000000000)
+ {
+ /*
+ * check 3-byte overlong: 1110.0000 100x.xxxx 10xx.xxxx
+ * (lead byte 0xE0 with a second byte below 0xA0)
+ */
+ if (chunk < 0xE0A0000000000000)
+ return -1;
+
+ /*
+ * check surrogate: 1110.1101 101x.xxxx 10xx.xxxx
+ * (lead byte 0xED with a second byte of 0xA0 or above)
+ */
+ if (chunk > 0xED9FBFFFffffffff && chunk < 0xEE00000000000000)
+ return -1;
+
+ /*
+ * found valid sequence for code points U+0800 through U+D7FF or
+ * U+E000 through U+FFFF
+ */
+ return 3;
+ }
+ /* 4-byte lead with three continuation bytes */
+ else if ((chunk & 0xF8C0C0C000000000) == 0xF080808000000000)
+ {
+ /*
+ * check 4-byte overlong: 1111.0000 1000.xxxx 10xx.xxxx 10xx.xxxx
+ */
+ if (chunk < 0xF090000000000000)
+ return -1;
+
+ /*
+ * check too large: 1111.0100 1001.xxxx 10xx.xxxx 10xx.xxxx
+ *
+ * This comparison also rejects lead bytes 0xF5 through 0xF7, which
+ * pass the mask test above.
+ */
+ if (chunk > 0xF48FBFBFffffffff)
+ return -1;
+
+ /* found valid sequence for code points U+010000 through U+10FFFF */
+ return 4;
+ }
+ else
+ /* invalid byte */
+ return -1;
+}
+
static int
pg_utf8_verifychar(const unsigned char *s, int len)
{
@@ -1761,28 +1855,62 @@ static int
pg_utf8_verifystr(const unsigned char *s, int len)
{
const unsigned char *start = s;
+ uint64 chunk;
- while (len > 0)
+ /*
+ * Fast path for when we have enough bytes left in the string to give
+ * check_ascii() a chance to advance the pointer. This also allows the
+ * functions in this loop to skip length checks.
+ *
+ * NOTE(review): len is an int and sizeof(chunk) is a size_t, so this
+ * comparison is presumably performed as unsigned; a negative len would
+ * then pass the test. Confirm that callers never pass a negative len.
+ */
+ while (len >= sizeof(chunk))
{
int l;
+ memcpy(&chunk, s, sizeof(chunk));
+
/* fast path for ASCII-subset characters */
+ l = check_ascii(chunk);
+ if (l)
+ goto advance;
+
+ /*
+ * Found non-ASCII or zero above, so verify a single character.
+ * First check the first byte for ASCII.
+ */
if (!IS_HIGHBIT_SET(*s))
{
if (*s == '\0')
- break;
- l = 1;
- }
- else
- {
- l = pg_utf8_verifychar(s, len);
- if (l == -1)
- break;
+ goto end;
+ else
+ {
+ l = 1;
+ goto advance;
+ }
}
+
+ /*
+ * Check for valid multibyte input. Since we already have the
+ * integer chunk, use that here as well.
+ */
+ l = pg_utf8_verifychar_internal(chunk);
+ if (l == -1)
+ goto end;
+
+advance:
+ s += l;
+ len -= l;
+ }
+
+ /* Slow path to handle the last few bytes in the string */
+ while (len > 0)
+ {
+ int l;
+
+ l = pg_utf8_verifychar(s, len);
+ if (l == -1)
+ goto end;
+
s += l;
len -= l;
}
+end:
return s - start;
}
diff --git a/src/test/regress/expected/conversion.out
b/src/test/regress/expected/conversion.out
index 04fdcba496..92b5df62c8 100644
--- a/src/test/regress/expected/conversion.out
+++ b/src/test/regress/expected/conversion.out
@@ -72,6 +72,91 @@ $$;
--
-- UTF-8
--
+-- The description column must be unique.
+CREATE TABLE utf8_verification_inputs (inbytes bytea, description text PRIMARY
KEY);
+insert into utf8_verification_inputs values
+ ('\xaf', 'bare continuation'),
+ ('\xc5', 'missing second byte in 2-byte char'),
+ ('\xc080', 'smallest 2-byte overlong'),
+ ('\xc1bf', 'largest 2-byte overlong'),
+ ('\xc280', 'next 2-byte after overlongs'),
+ ('\xdfbf', 'largest 2-byte'),
+ ('\xe9af', 'missing third byte in 3-byte char'),
+ ('\xe08080', 'smallest 3-byte overlong'),
+ ('\xe09fbf', 'largest 3-byte overlong'),
+ ('\xe0a080', 'next 3-byte after overlong'),
+ ('\xed9fbf', 'last before surrogates'),
+ ('\xeda080', 'smallest surrogate'),
+ ('\xedbfbf', 'largest surrogate'),
+ ('\xee8080', 'next after surrogates'),
+ ('\xefbfbf', 'largest 3-byte'),
+ ('\xf1afbf', 'missing fourth byte in 4-byte char'),
+ ('\xf0808080', 'smallest 4-byte overlong'),
+ ('\xf08fbfbf', 'largest 4-byte overlong'),
+ ('\xf0908080', 'next 4-byte after overlong'),
+ ('\xf48fbfbf', 'largest 4-byte'),
+ ('\xf4908080', 'smallest too large'),
+ ('\xfa9a9a8a8a', '5-byte'),
+ ('\x66006f', 'NUL byte');
+-- Test UTF-8 verification
+select description, (test_conv(inbytes, 'utf8', 'utf8')).* from
utf8_verification_inputs;
+ description | result | errorat |
error
+------------------------------------+------------+--------------+----------------------------------------------------------------
+ bare continuation | \x | \xaf | invalid byte
sequence for encoding "UTF8": 0xaf
+ missing second byte in 2-byte char | \x | \xc5 | invalid byte
sequence for encoding "UTF8": 0xc5
+ smallest 2-byte overlong | \x | \xc080 | invalid byte
sequence for encoding "UTF8": 0xc0 0x80
+ largest 2-byte overlong | \x | \xc1bf | invalid byte
sequence for encoding "UTF8": 0xc1 0xbf
+ next 2-byte after overlongs | \xc280 | |
+ largest 2-byte | \xdfbf | |
+ missing third byte in 3-byte char | \x | \xe9af | invalid byte
sequence for encoding "UTF8": 0xe9 0xaf
+ smallest 3-byte overlong | \x | \xe08080 | invalid byte
sequence for encoding "UTF8": 0xe0 0x80 0x80
+ largest 3-byte overlong | \x | \xe09fbf | invalid byte
sequence for encoding "UTF8": 0xe0 0x9f 0xbf
+ next 3-byte after overlong | \xe0a080 | |
+ last before surrogates | \xed9fbf | |
+ smallest surrogate | \x | \xeda080 | invalid byte
sequence for encoding "UTF8": 0xed 0xa0 0x80
+ largest surrogate | \x | \xedbfbf | invalid byte
sequence for encoding "UTF8": 0xed 0xbf 0xbf
+ next after surrogates | \xee8080 | |
+ largest 3-byte | \xefbfbf | |
+ missing fourth byte in 4-byte char | \x | \xf1afbf | invalid byte
sequence for encoding "UTF8": 0xf1 0xaf 0xbf
+ smallest 4-byte overlong | \x | \xf0808080 | invalid byte
sequence for encoding "UTF8": 0xf0 0x80 0x80 0x80
+ largest 4-byte overlong | \x | \xf08fbfbf | invalid byte
sequence for encoding "UTF8": 0xf0 0x8f 0xbf 0xbf
+ next 4-byte after overlong | \xf0908080 | |
+ largest 4-byte | \xf48fbfbf | |
+ smallest too large | \x | \xf4908080 | invalid byte
sequence for encoding "UTF8": 0xf4 0x90 0x80 0x80
+ 5-byte | \x | \xfa9a9a8a8a | invalid byte
sequence for encoding "UTF8": 0xfa
+ NUL byte | \x66 | \x006f | invalid byte
sequence for encoding "UTF8": 0x00
+(23 rows)
+
+-- Test UTF-8 verification with ASCII padding appended to provide
+-- coverage for algorithms that work on multiple bytes at a time.
+with test_bytes as (
+ -- The error message for a sequence starting with a 4-byte lead
+ -- will contain all 4 bytes if they are present, so add 3
+ -- ASCII bytes to the end to ensure consistent error messages.
+ select
+ inbytes,
+ description,
+ (test_conv(inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error
+ from utf8_verification_inputs
+), test_padded as (
+ select
+ description,
+ (test_conv(inbytes || repeat('.', 16)::bytea, 'utf8', 'utf8')).error
+ from test_bytes
+)
+select
+ description,
+ b.error as orig_error,
+ p.error as error_after_padding
+from test_padded p
+join test_bytes b
+using (description)
+where p.error is distinct from b.error
+order by description;
+ description | orig_error | error_after_padding
+-------------+------------+---------------------
+(0 rows)
+
CREATE TABLE utf8_inputs (inbytes bytea, description text);
insert into utf8_inputs values
('\x666f6f', 'valid, pure ASCII'),
diff --git a/src/test/regress/sql/conversion.sql
b/src/test/regress/sql/conversion.sql
index 8358682432..a3e12961db 100644
--- a/src/test/regress/sql/conversion.sql
+++ b/src/test/regress/sql/conversion.sql
@@ -74,6 +74,63 @@ $$;
--
-- UTF-8
--
+-- The description column must be unique.
+CREATE TABLE utf8_verification_inputs (inbytes bytea, description text PRIMARY
KEY);
+insert into utf8_verification_inputs values
+ ('\xaf', 'bare continuation'),
+ ('\xc5', 'missing second byte in 2-byte char'),
+ ('\xc080', 'smallest 2-byte overlong'),
+ ('\xc1bf', 'largest 2-byte overlong'),
+ ('\xc280', 'next 2-byte after overlongs'),
+ ('\xdfbf', 'largest 2-byte'),
+ ('\xe9af', 'missing third byte in 3-byte char'),
+ ('\xe08080', 'smallest 3-byte overlong'),
+ ('\xe09fbf', 'largest 3-byte overlong'),
+ ('\xe0a080', 'next 3-byte after overlong'),
+ ('\xed9fbf', 'last before surrogates'),
+ ('\xeda080', 'smallest surrogate'),
+ ('\xedbfbf', 'largest surrogate'),
+ ('\xee8080', 'next after surrogates'),
+ ('\xefbfbf', 'largest 3-byte'),
+ ('\xf1afbf', 'missing fourth byte in 4-byte char'),
+ ('\xf0808080', 'smallest 4-byte overlong'),
+ ('\xf08fbfbf', 'largest 4-byte overlong'),
+ ('\xf0908080', 'next 4-byte after overlong'),
+ ('\xf48fbfbf', 'largest 4-byte'),
+ ('\xf4908080', 'smallest too large'),
+ ('\xfa9a9a8a8a', '5-byte'),
+ ('\x66006f', 'NUL byte');
+
+-- Test UTF-8 verification
+select description, (test_conv(inbytes, 'utf8', 'utf8')).* from
utf8_verification_inputs;
+
+-- Test UTF-8 verification with ASCII padding appended to provide
+-- coverage for algorithms that work on multiple bytes at a time.
+with test_bytes as (
+ -- The error message for a sequence starting with a 4-byte lead
+ -- will contain all 4 bytes if they are present, so add 3
+ -- ASCII bytes to the end to ensure consistent error messages.
+ select
+ inbytes,
+ description,
+ (test_conv(inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error
+ from utf8_verification_inputs
+), test_padded as (
+ select
+ description,
+ (test_conv(inbytes || repeat('.', 16)::bytea, 'utf8', 'utf8')).error
+ from test_bytes
+)
+select
+ description,
+ b.error as orig_error,
+ p.error as error_after_padding
+from test_padded p
+join test_bytes b
+using (description)
+where p.error is distinct from b.error
+order by description;
+
CREATE TABLE utf8_inputs (inbytes bytea, description text);
insert into utf8_inputs values
('\x666f6f', 'valid, pure ASCII'),