From 039ddd985d24a2efccbf9fc34bc5ef36a3cfd9bb Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@postgresql.org>
Date: Fri, 17 Dec 2021 12:55:34 -0400
Subject: [PATCH v2 1/3] Move the implementation of pg_utf_mblen() to an inline
 function

Use the new inline function, pg_utf_mblen_fast(), to specialize
pg_mblen() for UTF-8. This provides a modest speedup for code that
calls pg_mblen() in a loop.

As a side effect, pg_utf8_verifychar() no longer performs its own check
for a zero byte, which was unnecessary.

WIP: The "_fast" suffix in pg_utf_mblen_fast() may be misleading -- the
point is that the function can be inlined, not that it uses a faster
algorithm.
---
 src/backend/utils/mb/mbutils.c |  6 ++++-
 src/common/wchar.c             | 45 ++--------------------------------
 src/include/mb/pg_wchar.h      | 33 +++++++++++++++++++++++++
 3 files changed, 40 insertions(+), 44 deletions(-)

diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c
index a13c398f4a..91eea625b9 100644
--- a/src/backend/utils/mb/mbutils.c
+++ b/src/backend/utils/mb/mbutils.c
@@ -965,7 +965,11 @@ pg_encoding_wchar2mb_with_len(int encoding,
 int
 pg_mblen(const char *mbstr)
 {
-	return pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
+	/* avoid the overhead of a function call for UTF-8 */
+	if (GetDatabaseEncoding() == PG_UTF8)
+		return pg_utf_mblen_fast((const unsigned char *) mbstr);
+	else
+		return pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
 }
 
 /* returns the display length of a multibyte character */
diff --git a/src/common/wchar.c b/src/common/wchar.c
index a6bffd0642..5fd682829c 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -536,37 +536,11 @@ pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
 
 /*
  * Return the byte length of a UTF8 character pointed to by s
- *
- * Note: in the current implementation we do not support UTF8 sequences
- * of more than 4 bytes; hence do NOT return a value larger than 4.
- * We return "1" for any leading byte that is either flat-out illegal or
- * indicates a length larger than we support.
- *
- * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
- * other places would need to be fixed to change this.
  */
 int
 pg_utf_mblen(const unsigned char *s)
 {
-	int			len;
-
-	if ((*s & 0x80) == 0)
-		len = 1;
-	else if ((*s & 0xe0) == 0xc0)
-		len = 2;
-	else if ((*s & 0xf0) == 0xe0)
-		len = 3;
-	else if ((*s & 0xf8) == 0xf0)
-		len = 4;
-#ifdef NOT_USED
-	else if ((*s & 0xfc) == 0xf8)
-		len = 5;
-	else if ((*s & 0xfe) == 0xfc)
-		len = 6;
-#endif
-	else
-		len = 1;
-	return len;
+	return pg_utf_mblen_fast(s);
 }
 
 /*
@@ -1724,22 +1698,7 @@ pg_gb18030_verifystr(const unsigned char *s, int len)
 static int
 pg_utf8_verifychar(const unsigned char *s, int len)
 {
-	int			l;
-
-	if ((*s & 0x80) == 0)
-	{
-		if (*s == '\0')
-			return -1;
-		return 1;
-	}
-	else if ((*s & 0xe0) == 0xc0)
-		l = 2;
-	else if ((*s & 0xf0) == 0xe0)
-		l = 3;
-	else if ((*s & 0xf8) == 0xf0)
-		l = 4;
-	else
-		l = 1;
+	int			l = pg_utf_mblen_fast(s);
 
 	if (l > len)
 		return -1;
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index d93ccac263..a8d67c1214 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -590,6 +590,39 @@ extern bool pg_utf8_islegal(const unsigned char *source, int length);
 extern int	pg_utf_mblen(const unsigned char *s);
 extern int	pg_mule_mblen(const unsigned char *s);
 
+/*
+ * Return the byte length of a UTF8 character pointed to by s
+ * Workhorse for pg_utf_mblen().
+ *
+ * Declared as inline for callers of pg_mblen() that are performance critical
+ * enough to justify specializing for UTF-8.
+ *
+ * Note: in the current implementation we do not support UTF8 sequences
+ * of more than 4 bytes; hence do NOT return a value larger than 4.
+ * We return "1" for any leading byte that is either flat-out illegal or
+ * indicates a length larger than we support.
+ *
+ * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
+ * other places would need to be fixed to change this.
+ */
+static inline int
+pg_utf_mblen_fast(const unsigned char *s)
+{
+	int			len;
+
+	if ((*s & 0x80) == 0)
+		len = 1;
+	else if ((*s & 0xe0) == 0xc0)
+		len = 2;
+	else if ((*s & 0xf0) == 0xe0)
+		len = 3;
+	else if ((*s & 0xf8) == 0xf0)
+		len = 4;
+	else
+		len = 1;
+	return len;
+}
+
 /*
  * The remaining functions are backend-only.
  */
-- 
2.31.1

