From ecce09c9184e09e59992aa146192f62557f1da8e Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@2ndquadrant.com>
Date: Sun, 7 Feb 2021 15:30:10 -0400
Subject: [PATCH v1] Add an ASCII fast path to multibyte encoding verification
 functions.

A large amount of database input consists of ASCII, regardless of locale,
so it makes sense to optimize for this. Using bitwise operations, verifying
the pure ASCII subset of a multibyte encoding is now several times faster.
Verifying pure multibyte strings is slightly slower, but that's not common.
ASCII interspersed with sections of multibyte text is more typical, and
that case speeds up by around 10%, depending on platform.
---
 src/common/wchar.c | 181 +++++++++++++++++++++++++++++++++++++++------
 1 file changed, 159 insertions(+), 22 deletions(-)

diff --git a/src/common/wchar.c b/src/common/wchar.c
index 6e7d731e02..0bd3175171 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -15,6 +15,10 @@
 #include "mb/pg_wchar.h"
 
 
+#define ASCII_STRIDE_LENGTH ((int) sizeof(uint64))	/* signed, like len */
+/* Signature of the per-encoding pg_*_verifychar functions. */
+typedef int (*verifychar_func) (const unsigned char *s, int len);
+
 /*
  * Operations on multi-byte encodings are driven by a table of helper
  * functions.
@@ -1105,6 +1109,62 @@ pg_gb18030_dsplen(const unsigned char *s)
  * single-byte encoding to be just "return 1".
  *-------------------------------------------------------------------
  */
+
+/*
+ * Fast path for verifying a string in a multibyte encoding: skips over
+ * ASCII_STRIDE_LENGTH-byte chunks that are pure ASCII, and hands chunks
+ * with high-bit bytes to verifychar, the matching pg_*_verifychar function.
+ *
+ * Returns the number of leading bytes verified. Stops at the first
+ * invalid character or when less than one stride remains; returns 0 if
+ * the input holds a zero byte. The caller verifies the rest.
+ */
+static int
+pg_verifystr_fast(const unsigned char *s, int len, verifychar_func verifychar)
+{
+	const unsigned char *start = s;
+	const int	stride = (int) ASCII_STRIDE_LENGTH;
+	uint64		chunk;
+
+	/* Compare as signed so that a negative length cannot wrap around. */
+	if (len < stride)
+		return 0;
+
+	/* If there are zero bytes, bail and let the slow path handle it. */
+	if (memchr(s, 0, len) != NULL)
+		return 0;
+
+	while (len >= stride)
+	{
+		int			remainder = stride;
+
+		/* memcpy avoids a misaligned, type-punned load from s. */
+		memcpy(&chunk, s, sizeof(chunk));
+
+		/* No byte has its high bit set: all ASCII, take the next chunk. */
+		if ((chunk & UINT64CONST(0x8080808080808080)) == 0)
+		{
+			s += stride;
+			len -= stride;
+			continue;
+		}
+
+		/* Non-ASCII found: verify this chunk with the given verifychar. */
+		while (remainder > 0)
+		{
+			int			l = (*verifychar) (s, len);
+
+			if (l == -1)
+				return s - start;
+			s += l;
+			len -= l;
+			remainder -= l;
+		}
+	}
+
+	return s - start;
+}
+
 static int
 pg_ascii_verifychar(const unsigned char *s, int len)
 {
@@ -1183,11 +1243,18 @@ static int
 pg_eucjp_verifystr(const unsigned char *s, int len)
 {
 	const unsigned char *start = s;
+	int			l;
 
-	while (len > 0)
+	/* fast path for longer sequences of ASCII-subset characters */
+	if (len > ASCII_STRIDE_LENGTH)
 	{
-		int			l;
+		l = pg_verifystr_fast(s, len, pg_eucjp_verifychar);
+		s += l;
+		len -= l;
+	}
 
+	while (len > 0)
+	{
 		/* fast path for ASCII-subset characters */
 		if (!IS_HIGHBIT_SET(*s))
 		{
@@ -1241,11 +1308,18 @@ static int
 pg_euckr_verifystr(const unsigned char *s, int len)
 {
 	const unsigned char *start = s;
+	int			l;
 
-	while (len > 0)
+	/* fast path for longer sequences of ASCII-subset characters */
+	if (len > ASCII_STRIDE_LENGTH)
 	{
-		int			l;
+		l = pg_verifystr_fast(s, len, pg_euckr_verifychar);
+		s += l;
+		len -= l;
+	}
 
+	while (len > 0)
+	{
 		/* fast path for ASCII-subset characters */
 		if (!IS_HIGHBIT_SET(*s))
 		{
@@ -1324,11 +1398,18 @@ static int
 pg_euctw_verifystr(const unsigned char *s, int len)
 {
 	const unsigned char *start = s;
+	int			l;
 
-	while (len > 0)
+	/* fast path for longer sequences of ASCII-subset characters */
+	if (len > ASCII_STRIDE_LENGTH)
 	{
-		int			l;
+		l = pg_verifystr_fast(s, len, pg_euctw_verifychar);
+		s += l;
+		len -= l;
+	}
 
+	while (len > 0)
+	{
 		/* fast path for ASCII-subset characters */
 		if (!IS_HIGHBIT_SET(*s))
 		{
@@ -1377,11 +1458,18 @@ static int
 pg_johab_verifystr(const unsigned char *s, int len)
 {
 	const unsigned char *start = s;
+	int			l;
 
-	while (len > 0)
+	/* fast path for longer sequences of ASCII-subset characters */
+	if (len > ASCII_STRIDE_LENGTH)
 	{
-		int			l;
+		l = pg_verifystr_fast(s, len, pg_johab_verifychar);
+		s += l;
+		len -= l;
+	}
 
+	while (len > 0)
+	{
 		/* fast path for ASCII-subset characters */
 		if (!IS_HIGHBIT_SET(*s))
 		{
@@ -1427,11 +1515,18 @@ static int
 pg_mule_verifystr(const unsigned char *s, int len)
 {
 	const unsigned char *start = s;
+	int			l;
 
-	while (len > 0)
+	/* fast path for longer sequences of ASCII-subset characters */
+	if (len > ASCII_STRIDE_LENGTH)
 	{
-		int			l;
+		l = pg_verifystr_fast(s, len, pg_mule_verifychar);
+		s += l;
+		len -= l;
+	}
 
+	while (len > 0)
+	{
 		/* fast path for ASCII-subset characters */
 		if (!IS_HIGHBIT_SET(*s))
 		{
@@ -1496,11 +1591,18 @@ static int
 pg_sjis_verifystr(const unsigned char *s, int len)
 {
 	const unsigned char *start = s;
+	int			l;
 
-	while (len > 0)
+	/* fast path for longer sequences of ASCII-subset characters */
+	if (len > ASCII_STRIDE_LENGTH)
 	{
-		int			l;
+		l = pg_verifystr_fast(s, len, pg_sjis_verifychar);
+		s += l;
+		len -= l;
+	}
 
+	while (len > 0)
+	{
 		/* fast path for ASCII-subset characters */
 		if (!IS_HIGHBIT_SET(*s))
 		{
@@ -1545,11 +1647,18 @@ static int
 pg_big5_verifystr(const unsigned char *s, int len)
 {
 	const unsigned char *start = s;
+	int			l;
 
-	while (len > 0)
+	/* fast path for longer sequences of ASCII-subset characters */
+	if (len > ASCII_STRIDE_LENGTH)
 	{
-		int			l;
+		l = pg_verifystr_fast(s, len, pg_big5_verifychar);
+		s += l;
+		len -= l;
+	}
 
+	while (len > 0)
+	{
 		/* fast path for ASCII-subset characters */
 		if (!IS_HIGHBIT_SET(*s))
 		{
@@ -1594,11 +1703,18 @@ static int
 pg_gbk_verifystr(const unsigned char *s, int len)
 {
 	const unsigned char *start = s;
+	int			l;
 
-	while (len > 0)
+	/* fast path for longer sequences of ASCII-subset characters */
+	if (len > ASCII_STRIDE_LENGTH)
 	{
-		int			l;
+		l = pg_verifystr_fast(s, len, pg_gbk_verifychar);
+		s += l;
+		len -= l;
+	}
 
+	while (len > 0)
+	{
 		/* fast path for ASCII-subset characters */
 		if (!IS_HIGHBIT_SET(*s))
 		{
@@ -1643,11 +1759,18 @@ static int
 pg_uhc_verifystr(const unsigned char *s, int len)
 {
 	const unsigned char *start = s;
+	int			l;
 
-	while (len > 0)
+	/* fast path for longer sequences of ASCII-subset characters */
+	if (len > ASCII_STRIDE_LENGTH)
 	{
-		int			l;
+		l = pg_verifystr_fast(s, len, pg_uhc_verifychar);
+		s += l;
+		len -= l;
+	}
 
+	while (len > 0)
+	{
 		/* fast path for ASCII-subset characters */
 		if (!IS_HIGHBIT_SET(*s))
 		{
@@ -1703,11 +1826,18 @@ static int
 pg_gb18030_verifystr(const unsigned char *s, int len)
 {
 	const unsigned char *start = s;
+	int			l;
 
-	while (len > 0)
+	/* fast path for longer sequences of ASCII-subset characters */
+	if (len > ASCII_STRIDE_LENGTH)
 	{
-		int			l;
+		l = pg_verifystr_fast(s, len, pg_gb18030_verifychar);
+		s += l;
+		len -= l;
+	}
 
+	while (len > 0)
+	{
 		/* fast path for ASCII-subset characters */
 		if (!IS_HIGHBIT_SET(*s))
 		{
@@ -1761,11 +1891,18 @@ static int
 pg_utf8_verifystr(const unsigned char *s, int len)
 {
 	const unsigned char *start = s;
+	int				l;
 
-	while (len > 0)
+	/* fast path for longer sequences of ASCII-subset characters */
+	if (len > ASCII_STRIDE_LENGTH)
 	{
-		int			l;
+		l = pg_verifystr_fast(s, len, pg_utf8_verifychar);
+		s += l;
+		len -= l;
+	}
 
+	while (len > 0)
+	{
 		/* fast path for ASCII-subset characters */
 		if (!IS_HIGHBIT_SET(*s))
 		{
-- 
2.22.0

