On Wed, Jul 6, 2022 at 12:18 PM Andres Freund <and...@anarazel.de> wrote:
> I think before committing something along those lines we should make the
> relevant bits also be applicable when ->strval is NULL, as several functions
> use that (notably json_in IIRC). Afaics we'd just need to move the strval
> check to be around the appendBinaryStringInfo().

That makes sense and is easy.

> And it should simplify the
> function, because some of the relevant code is duplicated outside as well...

Not sure how far to take this, but I put the returnable paths inside the "other" path, so only backslash will go back to the top.

Both of the above changes are split into a new 0003 patch for easier review, but in the end will likely be squashed with 0002.

--
John Naylor
EDB: http://www.enterprisedb.com
From 3d8b39ff1c1a4abf9effc45323b293b62551770a Mon Sep 17 00:00:00 2001 From: John Naylor <john.nay...@postgresql.org> Date: Wed, 6 Jul 2022 08:35:24 +0700 Subject: [PATCH v4 1/4] Simplify json lexing state Instead of updating the length as we go, use a const pointer to end of the input, which we know already at the start --- src/common/jsonapi.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/src/common/jsonapi.c b/src/common/jsonapi.c index 98e4ef0942..eeedc0645a 100644 --- a/src/common/jsonapi.c +++ b/src/common/jsonapi.c @@ -519,26 +519,23 @@ JsonParseErrorType json_lex(JsonLexContext *lex) { char *s; - int len; + char *const end = lex->input + lex->input_length; JsonParseErrorType result; /* Skip leading whitespace. */ s = lex->token_terminator; - len = s - lex->input; - while (len < lex->input_length && - (*s == ' ' || *s == '\t' || *s == '\n' || *s == '\r')) + while (s < end && (*s == ' ' || *s == '\t' || *s == '\n' || *s == '\r')) { if (*s++ == '\n') { ++lex->line_number; lex->line_start = s; } - len++; } lex->token_start = s; /* Determine token type. */ - if (len >= lex->input_length) + if (s >= end) { lex->token_start = NULL; lex->prev_token_terminator = lex->token_terminator; @@ -623,7 +620,7 @@ json_lex(JsonLexContext *lex) * the whole word as an unexpected token, rather than just * some unintuitive prefix thereof. */ - for (p = s; p - s < lex->input_length - len && JSON_ALPHANUMERIC_CHAR(*p); p++) + for (p = s; p < end && JSON_ALPHANUMERIC_CHAR(*p); p++) /* skip */ ; /* @@ -672,7 +669,7 @@ static inline JsonParseErrorType json_lex_string(JsonLexContext *lex) { char *s; - int len; + char *const end = lex->input + lex->input_length; int hi_surrogate = -1; if (lex->strval != NULL) @@ -680,13 +677,11 @@ json_lex_string(JsonLexContext *lex) Assert(lex->input_length > 0); s = lex->token_start; - len = lex->token_start - lex->input; for (;;) { s++; - len++; /* Premature end of the string. 
*/ - if (len >= lex->input_length) + if (s >= end) { lex->token_terminator = s; return JSON_INVALID_TOKEN; @@ -704,8 +699,7 @@ json_lex_string(JsonLexContext *lex) { /* OK, we have an escape character. */ s++; - len++; - if (len >= lex->input_length) + if (s >= end) { lex->token_terminator = s; return JSON_INVALID_TOKEN; @@ -718,8 +712,7 @@ json_lex_string(JsonLexContext *lex) for (i = 1; i <= 4; i++) { s++; - len++; - if (len >= lex->input_length) + if (s >= end) { lex->token_terminator = s; return JSON_INVALID_TOKEN; -- 2.36.1
From 82e13b6bebd85a152ededcfd75495c0c0f642354 Mon Sep 17 00:00:00 2001 From: John Naylor <john.nay...@postgresql.org> Date: Wed, 6 Jul 2022 15:50:09 +0700 Subject: [PATCH v4 4/4] Use vectorized lookahead in json_lex_string on x86 --- src/common/jsonapi.c | 48 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/src/common/jsonapi.c b/src/common/jsonapi.c index 81e176ad8d..44e8ed2b2f 100644 --- a/src/common/jsonapi.c +++ b/src/common/jsonapi.c @@ -24,6 +24,12 @@ #include "miscadmin.h" #endif +/* WIP: put somewhere sensible and consider removing CRC from the names */ +#if (defined (__x86_64__) || defined(_M_AMD64)) && (defined(USE_SSE42_CRC32C) || defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK)) +#include <nmmintrin.h> +#define USE_SSE2 +#endif + /* * The context of the parser is maintained by the recursive descent * mechanism, but is passed explicitly to the error reporting routine @@ -842,12 +848,54 @@ json_lex_string(JsonLexContext *lex) } else { +#ifdef USE_SSE2 + __m128i block, + has_backslash, + has_doublequote, + control, + has_control, + error_cum = _mm_setzero_si128(); + const __m128i backslash = _mm_set1_epi8('\\'); + const __m128i doublequote = _mm_set1_epi8('"'); + const __m128i max_control = _mm_set1_epi8(0x1F); +#endif /* start lookahead at current byte */ char *p = s; if (hi_surrogate != -1) return JSON_UNICODE_LOW_SURROGATE; +#ifdef USE_SSE2 + while (p < end - sizeof(__m128i)) + { + block = _mm_loadu_si128((const __m128i *) p); + + /* direct comparison to quotes and backslashes */ + has_backslash = _mm_cmpeq_epi8(block, backslash); + has_doublequote = _mm_cmpeq_epi8(block, doublequote); + + /* + * use saturation arithmetic to check for <= highest control + * char + */ + control = _mm_subs_epu8(block, max_control); + has_control = _mm_cmpeq_epi8(control, _mm_setzero_si128()); + + /* + * set bits in error_cum where the corresponding lanes in has_* + * are set + */ + error_cum = _mm_or_si128(error_cum, has_backslash); + 
error_cum = _mm_or_si128(error_cum, has_doublequote); + error_cum = _mm_or_si128(error_cum, has_control); + + if (_mm_movemask_epi8(error_cum)) + break; + + p += sizeof(__m128i); + } +#endif /* USE_SSE2 */ + while (p < end) { if (*p == '\\' || *p == '"') -- 2.36.1
From ef486287090daa24d51735ba9fa9585341b6e8ec Mon Sep 17 00:00:00 2001 From: John Naylor <john.nay...@postgresql.org> Date: Wed, 6 Jul 2022 15:35:33 +0700 Subject: [PATCH v4 3/4] Use lookahead path in json string lexing for the non-escape case too This removes some duplicated code and enables the no-escape path to be optimized in the same way. Per suggestion from Andres Freund --- src/common/jsonapi.c | 46 +++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/src/common/jsonapi.c b/src/common/jsonapi.c index ad4858c623..81e176ad8d 100644 --- a/src/common/jsonapi.c +++ b/src/common/jsonapi.c @@ -686,15 +686,6 @@ json_lex_string(JsonLexContext *lex) lex->token_terminator = s; return JSON_INVALID_TOKEN; } - else if (*s == '"') - break; - else if ((unsigned char) *s < 32) - { - /* Per RFC4627, these characters MUST be escaped. */ - /* Since *s isn't printable, exclude it from the context string */ - lex->token_terminator = s; - return JSON_ESCAPING_REQUIRED; - } else if (*s == '\\') { /* OK, we have an escape character. */ @@ -849,22 +840,41 @@ json_lex_string(JsonLexContext *lex) return JSON_ESCAPING_INVALID; } } - else if (lex->strval != NULL) + else { - /* start lookahead at next byte */ - char *p = s + 1; + /* start lookahead at current byte */ + char *p = s; if (hi_surrogate != -1) return JSON_UNICODE_LOW_SURROGATE; while (p < end) { - if (*p == '\\' || *p == '"' || (unsigned char) *p < 32) + if (*p == '\\' || *p == '"') break; + else if ((unsigned char) *p < 32) + { + /* Per RFC4627, these characters MUST be escaped. */ + /* + * Since *s isn't printable, exclude it from the context + * string + */ + lex->token_terminator = p; + return JSON_ESCAPING_REQUIRED; + } p++; } - appendBinaryStringInfo(lex->strval, s, p - s); + if (lex->strval != NULL) + appendBinaryStringInfo(lex->strval, s, p - s); + + if (*p == '"') + { + /* Hooray, we found the end of the string! 
*/ + lex->prev_token_terminator = lex->token_terminator; + lex->token_terminator = p + 1; + return JSON_SUCCESS; + } /* * s will be incremented at the top of the loop, so set it to just @@ -873,14 +883,6 @@ json_lex_string(JsonLexContext *lex) s = p - 1; } } - - if (hi_surrogate != -1) - return JSON_UNICODE_LOW_SURROGATE; - - /* Hooray, we found the end of the string! */ - lex->prev_token_terminator = lex->token_terminator; - lex->token_terminator = s + 1; - return JSON_SUCCESS; } /* -- 2.36.1
From 03d3be083ba60d272e848b3ec96db3c6d47a3b06 Mon Sep 17 00:00:00 2001 From: John Naylor <john.nay...@postgresql.org> Date: Fri, 1 Jul 2022 17:28:20 +0700 Subject: [PATCH v4 2/4] Build json strings in larger chunks during lexing Add lookahead loop to json_lex_string. This way, we can batch calls to appendBinaryStringInfo. Jelte Fennema and Andres Freund, with some adjustments by me Discussion: https://www.postgresql.org/message-id/CAGECzQQuXbies_nKgSiYifZUjBk6nOf2%3DTSXqRjj2BhUh8CTeA%40mail.gmail.com Discussion: https://www.postgresql.org/message-id/flat/pr3pr83mb0476f098cbcf68af7a1ca89ff7...@pr3pr83mb0476.eurprd83.prod.outlook.com --- src/common/jsonapi.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/src/common/jsonapi.c b/src/common/jsonapi.c index eeedc0645a..ad4858c623 100644 --- a/src/common/jsonapi.c +++ b/src/common/jsonapi.c @@ -851,10 +851,26 @@ json_lex_string(JsonLexContext *lex) } else if (lex->strval != NULL) { + /* start lookahead at next byte */ + char *p = s + 1; + if (hi_surrogate != -1) return JSON_UNICODE_LOW_SURROGATE; - appendStringInfoChar(lex->strval, *s); + while (p < end) + { + if (*p == '\\' || *p == '"' || (unsigned char) *p < 32) + break; + p++; + } + + appendBinaryStringInfo(lex->strval, s, p - s); + + /* + * s will be incremented at the top of the loop, so set it to just + * behind our lookahead position + */ + s = p - 1; } } -- 2.36.1