From 482df5ff99a40c64f1d87b08b41eac0206082eef Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@2ndquadrant.com>
Date: Sun, 18 Jul 2021 17:14:32 -0400
Subject: [PATCH v18 1/6] Use pure DFA

Based on https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725
---
 src/common/wchar.c                       | 182 +++++++++++++++++++++--
 src/test/regress/expected/conversion.out |  85 +++++++++++
 src/test/regress/sql/conversion.sql      |  57 +++++++
 3 files changed, 311 insertions(+), 13 deletions(-)

diff --git a/src/common/wchar.c b/src/common/wchar.c
index 0636b8765b..aafc602bcd 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -1757,32 +1757,188 @@ pg_utf8_verifychar(const unsigned char *s, int len)
 	return l;
 }
 
+/* States of the UTF-8 DFA; each value is the bit offset of its slot in a transition word. */
+
+#define DFA_BITS_PER_STATE 6
+#define DFA_MASK ((1 << DFA_BITS_PER_STATE) - 1)
+
+/* Start */
+#define	BGN UINT64CONST(0)
+/* Invalid sequence; every byte class maps this state back to itself */
+#define	ERR (UINT64CONST(1) * DFA_BITS_PER_STATE)
+/* Continuation states: CSn expects n more continuation bytes (80..BF) */
+#define	CS1 (UINT64CONST(2) * DFA_BITS_PER_STATE)
+#define	CS2 (UINT64CONST(3) * DFA_BITS_PER_STATE)
+#define	CS3 (UINT64CONST(4) * DFA_BITS_PER_STATE)
+/* Partial 3-byte sequence states: entered after lead byte E0 (P3A) or ED (P3B) */
+#define	P3A (UINT64CONST(5) * DFA_BITS_PER_STATE)
+#define	P3B (UINT64CONST(6) * DFA_BITS_PER_STATE)
+/* Partial 4-byte sequence states: entered after lead byte F0 (P4A) or F4 (P4B) */
+#define	P4A (UINT64CONST(7) * DFA_BITS_PER_STATE)
+#define	P4B (UINT64CONST(8) * DFA_BITS_PER_STATE)
+/* Start and End are the same state */
+#define	END BGN
+
+/*
+ * The DFA transition table would look like this if encoded as an array
+ * (ERR is lower case for readability).
+ *
+ * ILL  NZA  CR1  CR2  CR3  L2A  L3A  L3B  L3C  L4A  L4B  L4C CLASS / STATE
+ * =========================================================================
+ * err, END, err, err, err, CS1, P3A, CS2, P3B, P4A, CS3, P4B,     // BGN|END
+ * err, err, err, err, err, err, err, err, err, err, err, err,     // ERR
+ *
+ * err, err, END, END, END, err, err, err, err, err, err, err,     // CS1
+ * err, err, CS1, CS1, CS1, err, err, err, err, err, err, err,     // CS2
+ * err, err, CS2, CS2, CS2, err, err, err, err, err, err, err,     // CS3
+ *
+ * err, err, err, err, CS1, err, err, err, err, err, err, err,     // P3A
+ * err, err, CS1, CS1, err, err, err, err, err, err, err, err,     // P3B
+ *
+ * err, err, err, CS2, CS2, err, err, err, err, err, err, err,     // P4A
+ * err, err, CS2, err, err, err, err, err, err, err, err, err,     // P4B
+ */
+
+/* Each class packs, for every current state S, the next state at bit offset S (BGN is offset 0). */
+
+#define ERR_ON_ALL_NON_BGN_STATES ((ERR << ERR) | (ERR << CS1) | (ERR << CS2) | (ERR << CS3) | (ERR << P3A) | (ERR << P3B) | (ERR << P4A) | (ERR << P4B))
+
+/* 00, C0..C1, F5..FF  Invalid bytes that never appear in a UTF-8 sequence */
+#define ILL (ERR | ERR_ON_ALL_NON_BGN_STATES)
+
+/* 01..7F  Non-zero ASCII */
+#define NZA (END | ERR_ON_ALL_NON_BGN_STATES)
+
+/* 80..8F  Continuation range 1 */
+#define CR1 (ERR | (ERR << ERR) | (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (ERR << P3A) | (CS1 << P3B) | (ERR << P4A) | (CS2 << P4B))
+
+/* 90..9F  Continuation range 2 */
+#define CR2 (ERR | (ERR << ERR) | (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (ERR << P3A) | (CS1 << P3B) | (CS2 << P4A) | (ERR << P4B))
+
+/* A0..BF  Continuation range 3 */
+#define CR3 (ERR | (ERR << ERR) | (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3A) | (ERR << P3B) | (CS2 << P4A) | (ERR << P4B))
+
+/* C2..DF  2-byte lead */
+#define L2A (CS1 | ERR_ON_ALL_NON_BGN_STATES)
+
+/* E0      3-byte lead range A */
+#define L3A (P3A | ERR_ON_ALL_NON_BGN_STATES)
+
+/* E1..EC, EE..EF  3-byte lead range B */
+#define L3B (CS2 | ERR_ON_ALL_NON_BGN_STATES)
+
+/* ED      3-byte lead range C */
+#define L3C (P3B | ERR_ON_ALL_NON_BGN_STATES)
+
+/* F0      4-byte lead range A */
+#define L4A (P4A | ERR_ON_ALL_NON_BGN_STATES)
+
+/* F1..F3  4-byte lead range B */
+#define L4B (CS3 | ERR_ON_ALL_NON_BGN_STATES)
+
+/* F4      4-byte lead range C */
+#define L4C (P4B | ERR_ON_ALL_NON_BGN_STATES)
+
+/* Expands to 16 copies of its argument, for runs of bytes in the same class. */
+#define REP16(a) a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a
+/* Maps an input byte to a 64-bit word holding its next state for each current state. */
+static const uint64 ByteCategory[256] =
+{
+	/* 00..7F: NUL is invalid, everything else is non-zero ASCII */
+	ILL, NZA, NZA, NZA, NZA, NZA, NZA, NZA,
+	NZA, NZA, NZA, NZA, NZA, NZA, NZA, NZA,
+	REP16(NZA),
+	REP16(NZA), REP16(NZA),
+	REP16(NZA), REP16(NZA),
+	REP16(NZA), REP16(NZA),
+
+	/* continuation bytes */
+
+	/* 80..8F */
+	REP16(CR1),
+
+	/* 90..9F */
+	REP16(CR2),
+
+	/* A0..BF */
+	REP16(CR3), REP16(CR3),
+
+	/* leading bytes */
+
+	/* C0..CF */
+	ILL, ILL, L2A, L2A, L2A, L2A, L2A, L2A,
+	L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
+
+	/* D0..DF */
+	L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
+	L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
+
+	/* E0..EF */
+	L3A, L3B, L3B, L3B, L3B, L3B, L3B, L3B,
+	L3B, L3B, L3B, L3B, L3B, L3C, L3B, L3B,
+
+	/* F0..FF */
+	L4A, L4B, L4B, L4B, L4C, ILL, ILL, ILL,
+	ILL, ILL, ILL, ILL, ILL, ILL, ILL, ILL,
+};
+
+
+/* Return the byte length of the valid character at s, or -1; may read up to 4 bytes. */
+static inline int
+utf8_advance(const unsigned char *s)
+{
+	const unsigned char *p = s;
+	uint64		state = BGN;
+
+	/* feed bytes to the DFA until it lands on END (same value as BGN) or ERR */
+	do
+	{
+		state = (ByteCategory[*p++] >> state) & DFA_MASK;
+	} while (state > ERR);
+
+	if (state == ERR)
+		return -1;
+
+	/* no path through the transition table consumes more than 4 bytes */
+	Assert(p - s <= 4);
+	return (int) (p - s);
+}
+
 static int
 pg_utf8_verifystr(const unsigned char *s, int len)
 {
 	const unsigned char *start = s;
 
+	/*
+	 * Fast path: while at least 4 bytes remain, utf8_advance() cannot read
+	 * past the end of the input, whatever the length of the next character.
+	 */
+	while (len >= 4)
+	{
+		int			l;
+
+		l = utf8_advance(s);
+		if (l == -1)
+			goto end;
+
+		s += l;
+		len -= l;
+	}
+
+	/* fewer than 4 bytes are left; check them using the remaining length */
 	while (len > 0)
 	{
 		int			l;
 
-		/* fast path for ASCII-subset characters */
-		if (!IS_HIGHBIT_SET(*s))
-		{
-			if (*s == '\0')
-				break;
-			l = 1;
-		}
-		else
-		{
-			l = pg_utf8_verifychar(s, len);
-			if (l == -1)
-				break;
-		}
+		l = pg_utf8_verifychar(s, len);
+		if (l == -1)
+			goto end;
+
 		s += l;
 		len -= l;
 	}
 
+end:
 	return s - start;
 }
 
diff --git a/src/test/regress/expected/conversion.out b/src/test/regress/expected/conversion.out
index 04fdcba496..e4ab9fe765 100644
--- a/src/test/regress/expected/conversion.out
+++ b/src/test/regress/expected/conversion.out
@@ -72,6 +72,91 @@ $$;
 --
 -- UTF-8
 --
+-- The description column must be unique.
+CREATE TABLE utf8_verification_inputs (inbytes bytea, description text PRIMARY KEY);
+insert into utf8_verification_inputs  values
+  ('\xaf',		'bare continuation'),
+  ('\xc5',		'missing second byte in 2-byte char'),
+  ('\xc080',	'smallest 2-byte overlong'),
+  ('\xc1bf',	'largest 2-byte overlong'),
+  ('\xc280',	'next 2-byte after overlongs'),
+  ('\xdfbf',	'largest 2-byte'),
+  ('\xe9af',	'missing third byte in 3-byte char'),
+  ('\xe08080',	'smallest 3-byte overlong'),
+  ('\xe09fbf',	'largest 3-byte overlong'),
+  ('\xe0a080',	'next 3-byte after overlong'),
+  ('\xed9fbf',	'last before surrogates'),
+  ('\xeda080',	'smallest surrogate'),
+  ('\xedbfbf',	'largest surrogate'),
+  ('\xee8080',	'next after surrogates'),
+  ('\xefbfbf',	'largest 3-byte'),
+  ('\xf1afbf',	'missing fourth byte in 4-byte char'),
+  ('\xf0808080',	'smallest 4-byte overlong'),
+  ('\xf08fbfbf',	'largest 4-byte overlong'),
+  ('\xf0908080',	'next 4-byte after overlong'),
+  ('\xf48fbfbf',	'largest 4-byte'),
+  ('\xf4908080',	'smallest too large'),
+  ('\xfa9a9a8a8a',	'5-byte'),
+  ('\x66006f',    'NUL byte');
+-- Test UTF-8 verification
+select description, (test_conv(inbytes, 'utf8', 'utf8')).* from utf8_verification_inputs;
+            description             |   result   |   errorat    |                             error                              
+------------------------------------+------------+--------------+----------------------------------------------------------------
+ bare continuation                  | \x         | \xaf         | invalid byte sequence for encoding "UTF8": 0xaf
+ missing second byte in 2-byte char | \x         | \xc5         | invalid byte sequence for encoding "UTF8": 0xc5
+ smallest 2-byte overlong           | \x         | \xc080       | invalid byte sequence for encoding "UTF8": 0xc0 0x80
+ largest 2-byte overlong            | \x         | \xc1bf       | invalid byte sequence for encoding "UTF8": 0xc1 0xbf
+ next 2-byte after overlongs        | \xc280     |              | 
+ largest 2-byte                     | \xdfbf     |              | 
+ missing third byte in 3-byte char  | \x         | \xe9af       | invalid byte sequence for encoding "UTF8": 0xe9 0xaf
+ smallest 3-byte overlong           | \x         | \xe08080     | invalid byte sequence for encoding "UTF8": 0xe0 0x80 0x80
+ largest 3-byte overlong            | \x         | \xe09fbf     | invalid byte sequence for encoding "UTF8": 0xe0 0x9f 0xbf
+ next 3-byte after overlong         | \xe0a080   |              | 
+ last before surrogates             | \xed9fbf   |              | 
+ smallest surrogate                 | \x         | \xeda080     | invalid byte sequence for encoding "UTF8": 0xed 0xa0 0x80
+ largest surrogate                  | \x         | \xedbfbf     | invalid byte sequence for encoding "UTF8": 0xed 0xbf 0xbf
+ next after surrogates              | \xee8080   |              | 
+ largest 3-byte                     | \xefbfbf   |              | 
+ missing fourth byte in 4-byte char | \x         | \xf1afbf     | invalid byte sequence for encoding "UTF8": 0xf1 0xaf 0xbf
+ smallest 4-byte overlong           | \x         | \xf0808080   | invalid byte sequence for encoding "UTF8": 0xf0 0x80 0x80 0x80
+ largest 4-byte overlong            | \x         | \xf08fbfbf   | invalid byte sequence for encoding "UTF8": 0xf0 0x8f 0xbf 0xbf
+ next 4-byte after overlong         | \xf0908080 |              | 
+ largest 4-byte                     | \xf48fbfbf |              | 
+ smallest too large                 | \x         | \xf4908080   | invalid byte sequence for encoding "UTF8": 0xf4 0x90 0x80 0x80
+ 5-byte                             | \x         | \xfa9a9a8a8a | invalid byte sequence for encoding "UTF8": 0xfa
+ NUL byte                           | \x66       | \x006f       | invalid byte sequence for encoding "UTF8": 0x00
+(23 rows)
+
+-- Test UTF-8 verification with ASCII padding appended to provide
+-- coverage for algorithms that work on multiple bytes at a time.
+with test_bytes as (
+  -- The error message for a sequence starting with a 4-byte lead
+  -- will contain all 4 bytes if they are present, so add 3
+  -- ASCII bytes to the end to ensure consistent error messages.
+  select
+    inbytes,
+    description,
+    (test_conv(inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error
+  from utf8_verification_inputs
+), test_padded as (
+  select
+    description,
+    (test_conv(inbytes || repeat('.', 16)::bytea, 'utf8', 'utf8')).error
+  from test_bytes
+)
+select
+  description,
+  b.error as orig_error,
+  p.error as error_after_padding
+from test_padded p
+join test_bytes b
+using (description)
+where p.error is distinct from b.error
+order by description;
+ description | orig_error | error_after_padding 
+-------------+------------+---------------------
+(0 rows)
+
 CREATE TABLE utf8_inputs (inbytes bytea, description text);
 insert into utf8_inputs  values
   ('\x666f6f',		'valid, pure ASCII'),
diff --git a/src/test/regress/sql/conversion.sql b/src/test/regress/sql/conversion.sql
index 8358682432..e5a7e47958 100644
--- a/src/test/regress/sql/conversion.sql
+++ b/src/test/regress/sql/conversion.sql
@@ -74,6 +74,63 @@ $$;
 --
 -- UTF-8
 --
+-- The description column must be unique.
+CREATE TABLE utf8_verification_inputs (inbytes bytea, description text PRIMARY KEY);
+insert into utf8_verification_inputs  values
+  ('\xaf',		'bare continuation'),
+  ('\xc5',		'missing second byte in 2-byte char'),
+  ('\xc080',	'smallest 2-byte overlong'),
+  ('\xc1bf',	'largest 2-byte overlong'),
+  ('\xc280',	'next 2-byte after overlongs'),
+  ('\xdfbf',	'largest 2-byte'),
+  ('\xe9af',	'missing third byte in 3-byte char'),
+  ('\xe08080',	'smallest 3-byte overlong'),
+  ('\xe09fbf',	'largest 3-byte overlong'),
+  ('\xe0a080',	'next 3-byte after overlong'),
+  ('\xed9fbf',	'last before surrogates'),
+  ('\xeda080',	'smallest surrogate'),
+  ('\xedbfbf',	'largest surrogate'),
+  ('\xee8080',	'next after surrogates'),
+  ('\xefbfbf',	'largest 3-byte'),
+  ('\xf1afbf',	'missing fourth byte in 4-byte char'),
+  ('\xf0808080',	'smallest 4-byte overlong'),
+  ('\xf08fbfbf',	'largest 4-byte overlong'),
+  ('\xf0908080',	'next 4-byte after overlong'),
+  ('\xf48fbfbf',	'largest 4-byte'),
+  ('\xf4908080',	'smallest too large'),
+  ('\xfa9a9a8a8a',	'5-byte'),
+  ('\x66006f',    'NUL byte');
+
+-- Test UTF-8 verification
+select description, (test_conv(inbytes, 'utf8', 'utf8')).* from utf8_verification_inputs;
+
+-- Test UTF-8 verification with ASCII padding appended to provide
+-- coverage for algorithms that work on multiple bytes at a time.
+with test_bytes as (
+  -- The error message for a sequence starting with a 4-byte lead
+  -- will contain all 4 bytes if they are present, so add 3
+  -- ASCII bytes to the end to ensure consistent error messages.
+  select
+    inbytes,
+    description,
+    (test_conv(inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error
+  from utf8_verification_inputs
+), test_padded as (
+  select
+    description,
+    (test_conv(inbytes || repeat('.', 16)::bytea, 'utf8', 'utf8')).error
+  from test_bytes
+)
+select
+  description,
+  b.error as orig_error,
+  p.error as error_after_padding
+from test_padded p
+join test_bytes b
+using (description)
+where p.error is distinct from b.error
+order by description;
+
 CREATE TABLE utf8_inputs (inbytes bytea, description text);
 insert into utf8_inputs  values
   ('\x666f6f',		'valid, pure ASCII'),
-- 
2.31.1

