On Wed, Jul 6, 2022 at 12:18 PM Andres Freund <and...@anarazel.de> wrote:

> I think before committing something along those lines we should make the
> relevant bits also be applicable when ->strval is NULL, as several functions
> use that (notably json_in IIRC). Afaics we'd just need to move the strval
> check to be around the appendBinaryStringInfo().

That makes sense and is easy.

> And it should simplify the
> function, because some of the relevant code is duplicated outside as well...

Not sure how far to take this, but I put the returnable paths inside
the "other" path, so only backslash will go back to the top.

Both the above changes are split into a new 0003 patch for easier
review, but in the end will likely be squashed with 0002.

-- 
John Naylor
EDB: http://www.enterprisedb.com
From 3d8b39ff1c1a4abf9effc45323b293b62551770a Mon Sep 17 00:00:00 2001
From: John Naylor <john.nay...@postgresql.org>
Date: Wed, 6 Jul 2022 08:35:24 +0700
Subject: [PATCH v4 1/4] Simplify json lexing state

Instead of updating the length as we go, use a const pointer to end of
the input, which we know already at the start
---
 src/common/jsonapi.c | 23 ++++++++---------------
 1 file changed, 8 insertions(+), 15 deletions(-)

diff --git a/src/common/jsonapi.c b/src/common/jsonapi.c
index 98e4ef0942..eeedc0645a 100644
--- a/src/common/jsonapi.c
+++ b/src/common/jsonapi.c
@@ -519,26 +519,23 @@ JsonParseErrorType
 json_lex(JsonLexContext *lex)
 {
 	char	   *s;
-	int			len;
+	char	   *const end = lex->input + lex->input_length;
 	JsonParseErrorType result;
 
 	/* Skip leading whitespace. */
 	s = lex->token_terminator;
-	len = s - lex->input;
-	while (len < lex->input_length &&
-		   (*s == ' ' || *s == '\t' || *s == '\n' || *s == '\r'))
+	while (s < end && (*s == ' ' || *s == '\t' || *s == '\n' || *s == '\r'))
 	{
 		if (*s++ == '\n')
 		{
 			++lex->line_number;
 			lex->line_start = s;
 		}
-		len++;
 	}
 	lex->token_start = s;
 
 	/* Determine token type. */
-	if (len >= lex->input_length)
+	if (s >= end)
 	{
 		lex->token_start = NULL;
 		lex->prev_token_terminator = lex->token_terminator;
@@ -623,7 +620,7 @@ json_lex(JsonLexContext *lex)
 					 * the whole word as an unexpected token, rather than just
 					 * some unintuitive prefix thereof.
 					 */
-					for (p = s; p - s < lex->input_length - len && JSON_ALPHANUMERIC_CHAR(*p); p++)
+					for (p = s; p < end && JSON_ALPHANUMERIC_CHAR(*p); p++)
 						 /* skip */ ;
 
 					/*
@@ -672,7 +669,7 @@ static inline JsonParseErrorType
 json_lex_string(JsonLexContext *lex)
 {
 	char	   *s;
-	int			len;
+	char	   *const end = lex->input + lex->input_length;
 	int			hi_surrogate = -1;
 
 	if (lex->strval != NULL)
@@ -680,13 +677,11 @@ json_lex_string(JsonLexContext *lex)
 
 	Assert(lex->input_length > 0);
 	s = lex->token_start;
-	len = lex->token_start - lex->input;
 	for (;;)
 	{
 		s++;
-		len++;
 		/* Premature end of the string. */
-		if (len >= lex->input_length)
+		if (s >= end)
 		{
 			lex->token_terminator = s;
 			return JSON_INVALID_TOKEN;
@@ -704,8 +699,7 @@ json_lex_string(JsonLexContext *lex)
 		{
 			/* OK, we have an escape character. */
 			s++;
-			len++;
-			if (len >= lex->input_length)
+			if (s >= end)
 			{
 				lex->token_terminator = s;
 				return JSON_INVALID_TOKEN;
@@ -718,8 +712,7 @@ json_lex_string(JsonLexContext *lex)
 				for (i = 1; i <= 4; i++)
 				{
 					s++;
-					len++;
-					if (len >= lex->input_length)
+					if (s >= end)
 					{
 						lex->token_terminator = s;
 						return JSON_INVALID_TOKEN;
-- 
2.36.1

From 82e13b6bebd85a152ededcfd75495c0c0f642354 Mon Sep 17 00:00:00 2001
From: John Naylor <john.nay...@postgresql.org>
Date: Wed, 6 Jul 2022 15:50:09 +0700
Subject: [PATCH v4 4/4] Use vectorized lookahead in json_lex_string on x86

---
 src/common/jsonapi.c | 48 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/src/common/jsonapi.c b/src/common/jsonapi.c
index 81e176ad8d..44e8ed2b2f 100644
--- a/src/common/jsonapi.c
+++ b/src/common/jsonapi.c
@@ -24,6 +24,12 @@
 #include "miscadmin.h"
 #endif
 
+/*  WIP: put somewhere sensible and consider removing CRC from the names */
+#if (defined (__x86_64__) || defined(_M_AMD64)) && (defined(USE_SSE42_CRC32C) || defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK))
+#include <nmmintrin.h>
+#define USE_SSE2
+#endif
+
 /*
  * The context of the parser is maintained by the recursive descent
  * mechanism, but is passed explicitly to the error reporting routine
@@ -842,12 +848,54 @@ json_lex_string(JsonLexContext *lex)
 		}
 		else
 		{
+#ifdef USE_SSE2
+			__m128i		block,
+						has_backslash,
+						has_doublequote,
+						control,
+						has_control,
+						error_cum = _mm_setzero_si128();
+			const		__m128i backslash = _mm_set1_epi8('\\');
+			const		__m128i doublequote = _mm_set1_epi8('"');
+			const		__m128i max_control = _mm_set1_epi8(0x1F);
+#endif
 			/* start lookahead at current byte */
 			char	   *p = s;
 
 			if (hi_surrogate != -1)
 				return JSON_UNICODE_LOW_SURROGATE;
 
+#ifdef USE_SSE2
+			while (p < end - sizeof(__m128i))
+			{
+				block = _mm_loadu_si128((const __m128i *) p);
+
+				/* direct comparison to quotes and backslashes */
+				has_backslash = _mm_cmpeq_epi8(block, backslash);
+				has_doublequote = _mm_cmpeq_epi8(block, doublequote);
+
+				/*
+				 * use saturation arithmetic to check for <= highest control
+				 * char
+				 */
+				control = _mm_subs_epu8(block, max_control);
+				has_control = _mm_cmpeq_epi8(control, _mm_setzero_si128());
+
+				/*
+				 * set bits in error_cum where the corresponding lanes in has_*
+				 * are set
+				 */
+				error_cum = _mm_or_si128(error_cum, has_backslash);
+				error_cum = _mm_or_si128(error_cum, has_doublequote);
+				error_cum = _mm_or_si128(error_cum, has_control);
+
+				if (_mm_movemask_epi8(error_cum))
+					break;
+
+				p += sizeof(__m128i);
+			}
+#endif							/* USE_SSE2 */
+
 			while (p < end)
 			{
 				if (*p == '\\' || *p == '"')
-- 
2.36.1

From ef486287090daa24d51735ba9fa9585341b6e8ec Mon Sep 17 00:00:00 2001
From: John Naylor <john.nay...@postgresql.org>
Date: Wed, 6 Jul 2022 15:35:33 +0700
Subject: [PATCH v4 3/4] Use lookahead path in json string lexing for the
 non-escape case too

This removes some duplicated code and enables the no-escape path
to be optimized in the same way.

Per suggestion from Andres Freund
---
 src/common/jsonapi.c | 46 +++++++++++++++++++++++---------------------
 1 file changed, 24 insertions(+), 22 deletions(-)

diff --git a/src/common/jsonapi.c b/src/common/jsonapi.c
index ad4858c623..81e176ad8d 100644
--- a/src/common/jsonapi.c
+++ b/src/common/jsonapi.c
@@ -686,15 +686,6 @@ json_lex_string(JsonLexContext *lex)
 			lex->token_terminator = s;
 			return JSON_INVALID_TOKEN;
 		}
-		else if (*s == '"')
-			break;
-		else if ((unsigned char) *s < 32)
-		{
-			/* Per RFC4627, these characters MUST be escaped. */
-			/* Since *s isn't printable, exclude it from the context string */
-			lex->token_terminator = s;
-			return JSON_ESCAPING_REQUIRED;
-		}
 		else if (*s == '\\')
 		{
 			/* OK, we have an escape character. */
@@ -849,22 +840,41 @@ json_lex_string(JsonLexContext *lex)
 				return JSON_ESCAPING_INVALID;
 			}
 		}
-		else if (lex->strval != NULL)
+		else
 		{
-			/* start lookahead at next byte */
-			char	   *p = s + 1;
+			/* start lookahead at current byte */
+			char	   *p = s;
 
 			if (hi_surrogate != -1)
 				return JSON_UNICODE_LOW_SURROGATE;
 
 			while (p < end)
 			{
-				if (*p == '\\' || *p == '"' || (unsigned char) *p < 32)
+				if (*p == '\\' || *p == '"')
 					break;
+				else if ((unsigned char) *p < 32)
+				{
+					/* Per RFC4627, these characters MUST be escaped. */
+					/*
+					 * Since *s isn't printable, exclude it from the context
+					 * string
+					 */
+					lex->token_terminator = p;
+					return JSON_ESCAPING_REQUIRED;
+				}
 				p++;
 			}
 
-			appendBinaryStringInfo(lex->strval, s, p - s);
+			if (lex->strval != NULL)
+				appendBinaryStringInfo(lex->strval, s, p - s);
+
+			if (*p == '"')
+			{
+				/* Hooray, we found the end of the string! */
+				lex->prev_token_terminator = lex->token_terminator;
+				lex->token_terminator = p + 1;
+				return JSON_SUCCESS;
+			}
 
 			/*
 			 * s will be incremented at the top of the loop, so set it to just
@@ -873,14 +883,6 @@ json_lex_string(JsonLexContext *lex)
 			s = p - 1;
 		}
 	}
-
-	if (hi_surrogate != -1)
-		return JSON_UNICODE_LOW_SURROGATE;
-
-	/* Hooray, we found the end of the string! */
-	lex->prev_token_terminator = lex->token_terminator;
-	lex->token_terminator = s + 1;
-	return JSON_SUCCESS;
 }
 
 /*
-- 
2.36.1

From 03d3be083ba60d272e848b3ec96db3c6d47a3b06 Mon Sep 17 00:00:00 2001
From: John Naylor <john.nay...@postgresql.org>
Date: Fri, 1 Jul 2022 17:28:20 +0700
Subject: [PATCH v4 2/4] Build json strings in larger chunks during lexing

Add lookahead loop to json_lex_string. This way, we can batch calls
to appendBinaryStringInfo.

Jelte Fennema and Andres Freund, with some adjustments by me

Discussion:
https://www.postgresql.org/message-id/CAGECzQQuXbies_nKgSiYifZUjBk6nOf2%3DTSXqRjj2BhUh8CTeA%40mail.gmail.com
Discussion:
https://www.postgresql.org/message-id/flat/pr3pr83mb0476f098cbcf68af7a1ca89ff7...@pr3pr83mb0476.eurprd83.prod.outlook.com
---
 src/common/jsonapi.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/src/common/jsonapi.c b/src/common/jsonapi.c
index eeedc0645a..ad4858c623 100644
--- a/src/common/jsonapi.c
+++ b/src/common/jsonapi.c
@@ -851,10 +851,26 @@ json_lex_string(JsonLexContext *lex)
 		}
 		else if (lex->strval != NULL)
 		{
+			/* start lookahead at next byte */
+			char	   *p = s + 1;
+
 			if (hi_surrogate != -1)
 				return JSON_UNICODE_LOW_SURROGATE;
 
-			appendStringInfoChar(lex->strval, *s);
+			while (p < end)
+			{
+				if (*p == '\\' || *p == '"' || (unsigned char) *p < 32)
+					break;
+				p++;
+			}
+
+			appendBinaryStringInfo(lex->strval, s, p - s);
+
+			/*
+			 * s will be incremented at the top of the loop, so set it to just
+			 * behind our lookahead position
+			 */
+			s = p - 1;
 		}
 	}
 
-- 
2.36.1

Reply via email to