From c10bf1271e586c2cdebfb8e05a2dd9533c850d4a Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@2ndquadrant.com>
Date: Fri, 16 Jul 2021 18:16:03 -0400
Subject: [PATCH v17 2/2] Use integer chunk for fast path multibyte check

Based on an idea from Amit Khandekar:
https://www.postgresql.org/message-id/CAJ3gD9ejC%2BpuY%3DLgco2SGyD4tR46kye7qLZoskW0PXumtLcCpQ%40mail.gmail.com
---
 src/common/wchar.c | 158 ++++++++++++++++-----------------------------
 1 file changed, 56 insertions(+), 102 deletions(-)

diff --git a/src/common/wchar.c b/src/common/wchar.c
index 3ccef6c3cb..ec4bbb3b6a 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -13,49 +13,36 @@
 #include "c.h"
 
 #include "mb/pg_wchar.h"
+#include "port/pg_bswap.h"
 
 
-/* for UTF-8 */
-#define IS_CONTINUATION_BYTE(c)	(((c) & 0xC0) == 0x80)
-#define IS_TWO_BYTE_LEAD(c)		(((c) & 0xE0) == 0xC0)
-#define IS_THREE_BYTE_LEAD(c)	(((c) & 0xF0) == 0xE0)
-#define IS_FOUR_BYTE_LEAD(c)	(((c) & 0xF8) == 0xF0)
-
 /* Verify a chunk of bytes for valid ASCII including a zero-byte check. */
 static inline int
-check_ascii(const unsigned char *s, int len)
+check_ascii(const uint64 chunk)
 {
-	uint64		chunk,
-				highbits_set,
+	uint64		highbits_set,
 				highbit_carry;
 
-	if (len >= sizeof(uint64))
-	{
-		memcpy(&chunk, s, sizeof(uint64));
-
-		/* Check if any bytes in this chunk have the high bit set. */
-		highbits_set = chunk & UINT64CONST(0x8080808080808080);
-		if (highbits_set)
-			return 0;
+	/* Check if any bytes in this chunk have the high bit set. */
+	highbits_set = chunk & UINT64CONST(0x8080808080808080);
+	if (highbits_set)
+		return 0;
 
-		/*
-		 * Check if there are any zero bytes in this chunk.
-		 *
-		 * First, add 0x7f to each byte. This sets the high bit in each byte,
-		 * unless it was a zero. We already checked that none of the bytes had
-		 * the high bit set previously, so the max value each byte can have
-		 * after the addition is 0x7f + 0x7f = 0xfe, and we don't need to
-		 * worry about carrying over to the next byte.
-		 */
-		highbit_carry = chunk + UINT64CONST(0x7f7f7f7f7f7f7f7f);
+	/*
+	 * Check if there are any zero bytes in this chunk.
+	 *
+	 * First, add 0x7f to each byte. This sets the high bit in each byte,
+	 * unless it was a zero. We already checked that none of the bytes had the
+	 * high bit set previously, so the max value each byte can have after the
+	 * addition is 0x7f + 0x7f = 0xfe, and we don't need to worry about
+	 * carrying over to the next byte.
+	 */
+	highbit_carry = chunk + UINT64CONST(0x7f7f7f7f7f7f7f7f);
 
-		/* Then check that the high bit is set in each byte. */
-		highbit_carry &= UINT64CONST(0x8080808080808080);
-		if (highbit_carry == UINT64CONST(0x8080808080808080))
-			return sizeof(uint64);
-		else
-			return 0;
-	}
+	/* Then check that the high bit is set in each byte. */
+	highbit_carry &= UINT64CONST(0x8080808080808080);
+	if (highbit_carry == UINT64CONST(0x8080808080808080))
+		return sizeof(chunk);
 	else
 		return 0;
 }
@@ -1804,92 +1791,60 @@ pg_utf8_verifychar(const unsigned char *s, int len)
 
 /*
  * Subroutine of pg_utf8_verifystr() to check on char. Returns the length of the
- * character at *s in bytes, or 0 on invalid input or premature end of input.
- *
- * XXX: could this be combined with pg_utf8_verifychar above?
+ * character at the start of the chunk in bytes, or 0 on invalid input
+ * (including a zero byte). The chunk is 8 input bytes in memory order.
  */
 static inline int
-pg_utf8_verify_one(const unsigned char *s, int len)
+pg_utf8_verify_one(const uint64 chunk_orig)
 {
 	int			l;
-	unsigned char b1,
-				b2,
-				b3,
-				b4;
+	const uint64 chunk = (pg_hton64(chunk_orig));
 
-	/* Found non-ASCII or zero above, so verify a single character. */
-	if (!IS_HIGHBIT_SET(*s))
+	/* high bit of first byte not set: single-byte (ASCII) character */
+	if ((chunk & 0x8000000000000000) == 0)
 	{
-		if (*s == '\0')
+		/* check first byte for zero */
+		if (chunk < 0x0100000000000000)
 			return 0;
+
 		l = 1;
 	}
-	/* code points U+0080 through U+07FF */
-	else if (IS_TWO_BYTE_LEAD(*s))
+	/* 2-byte lead with one continuation byte */
+	else if ((chunk & 0xE0C0000000000000) == 0xC080000000000000)
 	{
-		l = 2;
-		if (len < l)
-			return 0;
-
-		b1 = *s;
-		b2 = *(s + 1);
-
-		if (!IS_CONTINUATION_BYTE(b2))
-			return 0;
-
 		/* check 2-byte overlong: 1100.000x.10xx.xxxx */
-		if (b1 < 0xC2)
+		if (chunk < 0xC200000000000000)
 			return 0;
+
+		l = 2;
 	}
-	/* code points U+0800 through U+D7FF and U+E000 through U+FFFF */
-	else if (IS_THREE_BYTE_LEAD(*s))
+	/* 3-byte lead with two continuation bytes */
+	else if ((chunk & 0xF0C0C00000000000) == 0xE080800000000000)
 	{
-		l = 3;
-		if (len < l)
-			return 0;
-
-		b1 = *s;
-		b2 = *(s + 1);
-		b3 = *(s + 2);
-
-		if (!IS_CONTINUATION_BYTE(b2) ||
-			!IS_CONTINUATION_BYTE(b3))
-			return 0;
-
-		/* check 3-byte overlong: 1110.0000 1001.xxxx 10xx.xxxx */
-		if (b1 == 0xE0 && b2 < 0xA0)
+		/* check 3-byte overlong: 1110.0000 100x.xxxx 10xx.xxxx */
+		if (chunk < 0xE0A0000000000000)
 			return 0;
 
 		/* check surrogate: 1110.1101 101x.xxxx 10xx.xxxx */
-		if (b1 == 0xED && b2 > 0x9F)
+		if (chunk > 0xED9FBFFFffffffff && chunk < 0xEE00000000000000)
 			return 0;
+
+		l = 3;
 	}
-	/* code points U+010000 through U+10FFFF */
-	else if (IS_FOUR_BYTE_LEAD(*s))
+	/* 4-byte lead with three continuation bytes */
+	else if ((chunk & 0xF8C0C0C000000000) == 0xF080808000000000)
 	{
-		l = 4;
-		if (len < l)
-			return 0;
-
-		b1 = *s;
-		b2 = *(s + 1);
-		b3 = *(s + 2);
-		b4 = *(s + 3);
-
-		if (!IS_CONTINUATION_BYTE(b2) ||
-			!IS_CONTINUATION_BYTE(b3) ||
-			!IS_CONTINUATION_BYTE(b4))
-			return 0;
-
 		/*
 		 * check 4-byte overlong: 1111.0000 1000.xxxx 10xx.xxxx 10xx.xxxx
 		 */
-		if (b1 == 0xF0 && b2 < 0x90)
+		if (chunk < 0xF090000000000000)
 			return 0;
 
 		/* check too large: 1111.0100 1001.xxxx 10xx.xxxx 10xx.xxxx */
-		if ((b1 == 0xF4 && b2 > 0x8F) || b1 > 0xF4)
+		if (chunk > 0xF48FBFBFffffffff)
 			return 0;
+
+		l = 4;
 	}
 	else
 		/* invalid byte */
@@ -1898,22 +1853,23 @@ pg_utf8_verify_one(const unsigned char *s, int len)
 	return l;
 }
 
-
 static int
 pg_utf8_verifystr(const unsigned char *s, int len)
 {
 	const unsigned char *start = s;
+	uint64		chunk;
 
 	/*
-	 * Fast path when we have at least 8 bytes left in the string. We can skip the
-	 * length checks in the loop.
+	 * Fast path when we have at least 8 bytes left in the string.
 	 */
 	while (len >= 8)
 	{
 		int			l;
 
+		memcpy(&chunk, s, sizeof(chunk));
+
 		/* fast path for ASCII-subset characters */
-		l = check_ascii(s, 8);
+		l = check_ascii(chunk);
 		if (l)
 		{
 			s += l;
@@ -1923,10 +1879,8 @@ pg_utf8_verifystr(const unsigned char *s, int len)
 
 		/*
 		 * Found non-ASCII or zero above, so verify a single character.
-		 * By passing length as constant, the compiler should optimize away
-		 * the length-checks in pg_utf8_verify_one.
 		 */
-		l = pg_utf8_verify_one(s, 8);
+		l = pg_utf8_verify_one(chunk);
 		if (l == 0)
 			goto end;
 
@@ -1939,8 +1893,8 @@ pg_utf8_verifystr(const unsigned char *s, int len)
 	{
 		int			l;
 
-		l = pg_utf8_verify_one(s, len);
-		if (l == 0)
+		l = pg_utf8_verifychar(s, len);
+		if (l == -1)
 			goto end;
 
 		s += l;
-- 
2.31.1

