Re: speed up verifying UTF-8

John Naylor Thu, 15 Jul 2021 15:00:37 -0700

I wrote:

> To simplify the constants, I do shift down to uint32, and I didn't bother
> working around that. v16alpha regressed on worst-case input, so for v16beta
> I went back to the earlier coding for the one-byte ASCII check. That helped,
> but it's still slower than v14.


It occurred to me that I could rewrite the switch test into simple
comparisons, like I already had for the 2- and 4-byte lead cases. While I
was at it, I folded the leading byte and continuation tests into a single
operation, like this:

/* 3-byte lead with two continuation bytes */
else if ((chunk & 0xF0C0C00000000000) == 0xE080800000000000)

...and I also tried using 64-bit constants to avoid shifting. It still
didn't quite beat v14, but it got pretty close:

> The numbers on Power8 / gcc 4.8 (little endian):
>
> HEAD:
>
>  chinese | mixed | ascii | mixed16 | mixed8
> ---------+-------+-------+---------+--------
>     2951 |  1521 |   871 |    1474 |   1508
>
> v14:
>
>  chinese | mixed | ascii | mixed16 | mixed8
> ---------+-------+-------+---------+--------
>      885 |   607 |   179 |     774 |   1325

v16gamma:

 chinese | mixed | ascii | mixed16 | mixed8
---------+-------+-------+---------+--------
     952 |   632 |   180 |     800 |   1333

A big-endian 64-bit platform just might shave enough cycles to beat v14
this way (the pg_hton64() byte swap applied to each chunk is a no-op
there)... or not.

--
John Naylor
EDB: http://www.enterprisedb.com

diff --git a/src/common/wchar.c b/src/common/wchar.c
index 0636b8765b..f48d79638c 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -13,8 +13,41 @@
 #include "c.h"
 
 #include "mb/pg_wchar.h"
+#include "port/pg_bswap.h"
 
 
+/* Verify a chunk of bytes for valid ASCII including a zero-byte check. */
+static inline int
+check_ascii(const uint64 chunk)
+{
+       uint64
+                               highbits_set,
+                               highbit_carry;
+
+       /* Check if any bytes in this chunk have the high bit set. */
+       highbits_set = chunk & UINT64CONST(0x8080808080808080);
+       if (highbits_set)
+               return 0;
+
+       /*
+        * Check if there are any zero bytes in this chunk.
+        *
+        * First, add 0x7f to each byte. This sets the high bit in each byte,
+        * unless it was a zero. We already checked that none of the bytes had the
+        * high bit set previously, so the max value each byte can have after the
+        * addition is 0x7f + 0x7f = 0xfe, and we don't need to worry about
+        * carrying over to the next byte.
+        */
+       highbit_carry = chunk + UINT64CONST(0x7f7f7f7f7f7f7f7f);
+
+       /* Then check that the high bit is set in each byte. */
+       highbit_carry &= UINT64CONST(0x8080808080808080);
+       if (highbit_carry == UINT64CONST(0x8080808080808080))
+               return sizeof(chunk);
+       else
+               return 0;
+}
+
 /*
  * Operations on multi-byte encodings are driven by a table of helper
  * functions.
@@ -1728,6 +1761,67 @@ pg_gb18030_verifystr(const unsigned char *s, int len)
        return s - start;
 }
 
+/*
+ * Workhorse for pg_utf8_verifychar(). Returns the length in bytes of the
+ * character that starts the given 8-byte chunk, or -1 on invalid input.
+ * Static inline for the benefit of pg_utf8_verifystr().
+ */
+static inline int
+pg_utf8_verifychar_internal(const uint64 chunk_orig)
+{
+       const uint64 chunk = (pg_hton64(chunk_orig));
+
+       /* high bit should be set */
+       Assert((chunk & 0x8000000000000000) != 0);
+
+       /* 2-byte lead with one continuation byte */
+       if ((chunk & 0xE0C0000000000000) == 0xC080000000000000)
+       {
+               /* check 2-byte overlong: 1100.000x.10xx.xxxx */
+               if (chunk < 0xC200000000000000)
+                       return -1;
+
+               /* found valid sequence for code points U+0080 through U+07FF */
+               return 2;
+       }
+       /* 3-byte lead with two continuation bytes */
+       else if ((chunk & 0xF0C0C00000000000) == 0xE080800000000000)
+       {
+               /* check 3-byte overlong: 1110.0000 100x.xxxx 10xx.xxxx */
+               if (chunk < 0xE0A0000000000000)
+                       return -1;
+
+               /* check surrogate: 1110.1101 101x.xxxx 10xx.xxxx */
+               if (chunk > 0xED9FBFFFffffffff && chunk < 0xEE00000000000000)
+                       return -1;
+
+               /*
+                * found valid sequence for code points U+0800 through U+D7FF or
+                * U+E000 through U+FFFF
+                */
+               return 3;
+       }
+       /* 4-byte lead with three continuation bytes */
+       else if ((chunk & 0xF8C0C0C000000000) == 0xF080808000000000)
+       {
+               /*
+                * check 4-byte overlong: 1111.0000 1000.xxxx 10xx.xxxx 10xx.xxxx
+                */
+               if (chunk < 0xF090000000000000)
+                       return -1;
+
+               /* check too large: 1111.0100 1001.xxxx 10xx.xxxx 10xx.xxxx */
+               if (chunk > 0xF48FBFBFffffffff)
+                       return -1;
+
+               /* found valid sequence for code points U+010000 through U+10FFFF */
+               return 4;
+       }
+       else
+               /* invalid byte */
+               return -1;
+}
+
 static int
 pg_utf8_verifychar(const unsigned char *s, int len)
 {
@@ -1761,28 +1855,62 @@ static int
 pg_utf8_verifystr(const unsigned char *s, int len)
 {
        const unsigned char *start = s;
+       uint64          chunk;
 
-       while (len > 0)
+       /*
+        * Fast path for when we have enough bytes left in the string to give
+        * check_ascii() a chance to advance the pointer. This also allows the
+        * functions in this loop to skip length checks.
+        */
+       while (len >= sizeof(chunk))
        {
                int                     l;
 
+               memcpy(&chunk, s, sizeof(chunk));
+
                /* fast path for ASCII-subset characters */
+               l = check_ascii(chunk);
+               if (l)
+                       goto advance;
+
+               /*
+                * Found non-ASCII or zero above, so verify a single character. First check the first byte for ASCII.
+                */
                if (!IS_HIGHBIT_SET(*s))
                {
                        if (*s == '\0')
-                               break;
-                       l = 1;
-               }
-               else
-               {
-                       l = pg_utf8_verifychar(s, len);
-                       if (l == -1)
-                               break;
+                               goto end;
+                       else
+                       {
+                               l = 1;
+                               goto advance;
+                       }
                }
+
+               /* Check for valid multibyte input. Since we already have the integer chunk, use that here as well. */
+               l = pg_utf8_verifychar_internal(chunk);
+               if (l == -1)
+                       goto end;
+
+advance:
+               s += l;
+               len -= l;
+       }
+
+       /* Slow path to handle the last few bytes in the string */
+       while (len > 0)
+       {
+               int                     l;
+
+               l = pg_utf8_verifychar(s, len);
+               if (l == -1)
+                       goto end;
+
                s += l;
                len -= l;
        }
 
+end:
        return s - start;
 }
 
diff --git a/src/test/regress/expected/conversion.out b/src/test/regress/expected/conversion.out
index 04fdcba496..92b5df62c8 100644
--- a/src/test/regress/expected/conversion.out
+++ b/src/test/regress/expected/conversion.out
@@ -72,6 +72,91 @@ $$;
 --
 -- UTF-8
 --
+-- The description column must be unique.
+CREATE TABLE utf8_verification_inputs (inbytes bytea, description text PRIMARY KEY);
+insert into utf8_verification_inputs  values
+  ('\xaf',             'bare continuation'),
+  ('\xc5',             'missing second byte in 2-byte char'),
+  ('\xc080',   'smallest 2-byte overlong'),
+  ('\xc1bf',   'largest 2-byte overlong'),
+  ('\xc280',   'next 2-byte after overlongs'),
+  ('\xdfbf',   'largest 2-byte'),
+  ('\xe9af',   'missing third byte in 3-byte char'),
+  ('\xe08080', 'smallest 3-byte overlong'),
+  ('\xe09fbf', 'largest 3-byte overlong'),
+  ('\xe0a080', 'next 3-byte after overlong'),
+  ('\xed9fbf', 'last before surrogates'),
+  ('\xeda080', 'smallest surrogate'),
+  ('\xedbfbf', 'largest surrogate'),
+  ('\xee8080', 'next after surrogates'),
+  ('\xefbfbf', 'largest 3-byte'),
+  ('\xf1afbf', 'missing fourth byte in 4-byte char'),
+  ('\xf0808080',       'smallest 4-byte overlong'),
+  ('\xf08fbfbf',       'largest 4-byte overlong'),
+  ('\xf0908080',       'next 4-byte after overlong'),
+  ('\xf48fbfbf',       'largest 4-byte'),
+  ('\xf4908080',       'smallest too large'),
+  ('\xfa9a9a8a8a',     '5-byte'),
+  ('\x66006f',    'NUL byte');
+-- Test UTF-8 verification
+select description, (test_conv(inbytes, 'utf8', 'utf8')).* from utf8_verification_inputs;
+            description             |   result   |   errorat    |                             error                              
+------------------------------------+------------+--------------+----------------------------------------------------------------
+ bare continuation                  | \x         | \xaf         | invalid byte sequence for encoding "UTF8": 0xaf
+ missing second byte in 2-byte char | \x         | \xc5         | invalid byte sequence for encoding "UTF8": 0xc5
+ smallest 2-byte overlong           | \x         | \xc080       | invalid byte sequence for encoding "UTF8": 0xc0 0x80
+ largest 2-byte overlong            | \x         | \xc1bf       | invalid byte sequence for encoding "UTF8": 0xc1 0xbf
+ next 2-byte after overlongs        | \xc280     |              | 
+ largest 2-byte                     | \xdfbf     |              | 
+ missing third byte in 3-byte char  | \x         | \xe9af       | invalid byte sequence for encoding "UTF8": 0xe9 0xaf
+ smallest 3-byte overlong           | \x         | \xe08080     | invalid byte sequence for encoding "UTF8": 0xe0 0x80 0x80
+ largest 3-byte overlong            | \x         | \xe09fbf     | invalid byte sequence for encoding "UTF8": 0xe0 0x9f 0xbf
+ next 3-byte after overlong         | \xe0a080   |              | 
+ last before surrogates             | \xed9fbf   |              | 
+ smallest surrogate                 | \x         | \xeda080     | invalid byte sequence for encoding "UTF8": 0xed 0xa0 0x80
+ largest surrogate                  | \x         | \xedbfbf     | invalid byte sequence for encoding "UTF8": 0xed 0xbf 0xbf
+ next after surrogates              | \xee8080   |              | 
+ largest 3-byte                     | \xefbfbf   |              | 
+ missing fourth byte in 4-byte char | \x         | \xf1afbf     | invalid byte sequence for encoding "UTF8": 0xf1 0xaf 0xbf
+ smallest 4-byte overlong           | \x         | \xf0808080   | invalid byte sequence for encoding "UTF8": 0xf0 0x80 0x80 0x80
+ largest 4-byte overlong            | \x         | \xf08fbfbf   | invalid byte sequence for encoding "UTF8": 0xf0 0x8f 0xbf 0xbf
+ next 4-byte after overlong         | \xf0908080 |              | 
+ largest 4-byte                     | \xf48fbfbf |              | 
+ smallest too large                 | \x         | \xf4908080   | invalid byte sequence for encoding "UTF8": 0xf4 0x90 0x80 0x80
+ 5-byte                             | \x         | \xfa9a9a8a8a | invalid byte sequence for encoding "UTF8": 0xfa
+ NUL byte                           | \x66       | \x006f       | invalid byte sequence for encoding "UTF8": 0x00
+(23 rows)
+
+-- Test UTF-8 verification with ASCII padding appended to provide
+-- coverage for algorithms that work on multiple bytes at a time.
+with test_bytes as (
+  -- The error message for a sequence starting with a 4-byte lead
+  -- will contain all 4 bytes if they are present, so add 3
+  -- ASCII bytes to the end to ensure consistent error messages.
+  select
+    inbytes,
+    description,
+    (test_conv(inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error
+  from utf8_verification_inputs
+), test_padded as (
+  select
+    description,
+    (test_conv(inbytes || repeat('.', 16)::bytea, 'utf8', 'utf8')).error
+  from test_bytes
+)
+select
+  description,
+  b.error as orig_error,
+  p.error as error_after_padding
+from test_padded p
+join test_bytes b
+using (description)
+where p.error is distinct from b.error
+order by description;
+ description | orig_error | error_after_padding 
+-------------+------------+---------------------
+(0 rows)
+
 CREATE TABLE utf8_inputs (inbytes bytea, description text);
 insert into utf8_inputs  values
   ('\x666f6f',         'valid, pure ASCII'),
diff --git a/src/test/regress/sql/conversion.sql b/src/test/regress/sql/conversion.sql
index 8358682432..a3e12961db 100644
--- a/src/test/regress/sql/conversion.sql
+++ b/src/test/regress/sql/conversion.sql
@@ -74,6 +74,63 @@ $$;
 --
 -- UTF-8
 --
+-- The description column must be unique.
+CREATE TABLE utf8_verification_inputs (inbytes bytea, description text PRIMARY KEY);
+insert into utf8_verification_inputs  values
+  ('\xaf',             'bare continuation'),
+  ('\xc5',             'missing second byte in 2-byte char'),
+  ('\xc080',   'smallest 2-byte overlong'),
+  ('\xc1bf',   'largest 2-byte overlong'),
+  ('\xc280',   'next 2-byte after overlongs'),
+  ('\xdfbf',   'largest 2-byte'),
+  ('\xe9af',   'missing third byte in 3-byte char'),
+  ('\xe08080', 'smallest 3-byte overlong'),
+  ('\xe09fbf', 'largest 3-byte overlong'),
+  ('\xe0a080', 'next 3-byte after overlong'),
+  ('\xed9fbf', 'last before surrogates'),
+  ('\xeda080', 'smallest surrogate'),
+  ('\xedbfbf', 'largest surrogate'),
+  ('\xee8080', 'next after surrogates'),
+  ('\xefbfbf', 'largest 3-byte'),
+  ('\xf1afbf', 'missing fourth byte in 4-byte char'),
+  ('\xf0808080',       'smallest 4-byte overlong'),
+  ('\xf08fbfbf',       'largest 4-byte overlong'),
+  ('\xf0908080',       'next 4-byte after overlong'),
+  ('\xf48fbfbf',       'largest 4-byte'),
+  ('\xf4908080',       'smallest too large'),
+  ('\xfa9a9a8a8a',     '5-byte'),
+  ('\x66006f',    'NUL byte');
+
+-- Test UTF-8 verification
+select description, (test_conv(inbytes, 'utf8', 'utf8')).* from utf8_verification_inputs;
+
+-- Test UTF-8 verification with ASCII padding appended to provide
+-- coverage for algorithms that work on multiple bytes at a time.
+with test_bytes as (
+  -- The error message for a sequence starting with a 4-byte lead
+  -- will contain all 4 bytes if they are present, so add 3
+  -- ASCII bytes to the end to ensure consistent error messages.
+  select
+    inbytes,
+    description,
+    (test_conv(inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error
+  from utf8_verification_inputs
+), test_padded as (
+  select
+    description,
+    (test_conv(inbytes || repeat('.', 16)::bytea, 'utf8', 'utf8')).error
+  from test_bytes
+)
+select
+  description,
+  b.error as orig_error,
+  p.error as error_after_padding
+from test_padded p
+join test_bytes b
+using (description)
+where p.error is distinct from b.error
+order by description;
+
 CREATE TABLE utf8_inputs (inbytes bytea, description text);
 insert into utf8_inputs  values
   ('\x666f6f',         'valid, pure ASCII'),

Re: speed up verifying UTF-8

Reply via email to