Re: Bug in jsonb_in function (14 & 15 version are affected)

Tom Lane Mon, 13 Mar 2023 10:58:19 -0700

Nikolay Shaplov <[email protected]> writes:
> I found a bug in jsonb_in function (it converts json from string representation
>  into jsonb internal representation).


Yeah.  Looks like json_lex_string is failing to honor the invariant
that it needs to set token_terminator ... although the documentation
of the function certainly isn't helping.  I think we need the attached.

A nice side benefit is that the error context reports get a lot more
useful --- somebody should have inquired before as to why they were
so bogus.

                        regards, tom lane

diff --git a/src/backend/utils/adt/jsonfuncs.c b/src/backend/utils/adt/jsonfuncs.c
index bdfc48cdf5..7a36f74dad 100644
--- a/src/backend/utils/adt/jsonfuncs.c
+++ b/src/backend/utils/adt/jsonfuncs.c
@@ -675,6 +675,7 @@ report_json_context(JsonLexContext *lex)
 	line_start = lex->line_start;
 	context_start = line_start;
 	context_end = lex->token_terminator;
+	Assert(context_end >= context_start);
 
 	/* Advance until we are close enough to context_end */
 	while (context_end - context_start >= 50)
diff --git a/src/common/jsonapi.c b/src/common/jsonapi.c
index e4ff3f3602..4e2a664603 100644
--- a/src/common/jsonapi.c
+++ b/src/common/jsonapi.c
@@ -697,6 +697,14 @@ json_lex(JsonLexContext *lex)
 
 /*
  * The next token in the input stream is known to be a string; lex it.
+ *
+ * If lex->strval isn't NULL, fill it with the decoded string.
+ * Set lex->token_terminator to the end of the decoded input, and in
+ * success cases, transfer its previous value to lex->prev_token_terminator.
+ * Return JSON_SUCCESS or an error code.
+ *
+ * Note: be careful that all error exits advance lex->token_terminator
+ * to the point after the character we detected the error on.
  */
 static inline JsonParseErrorType
 json_lex_string(JsonLexContext *lex)
@@ -705,6 +713,19 @@ json_lex_string(JsonLexContext *lex)
 	char	   *const end = lex->input + lex->input_length;
 	int			hi_surrogate = -1;
 
+	/* Convenience macros */
+#define FAIL_AT_CHAR_START(code) \
+	do { \
+		lex->token_terminator = s; \
+		return code; \
+	} while (0)
+#define FAIL_AT_CHAR_END(code) \
+	do { \
+		lex->token_terminator = \
+			s + pg_encoding_mblen_bounded(lex->input_encoding, s); \
+		return code; \
+	} while (0)
+
 	if (lex->strval != NULL)
 		resetStringInfo(lex->strval);
 
@@ -715,10 +736,7 @@ json_lex_string(JsonLexContext *lex)
 		s++;
 		/* Premature end of the string. */
 		if (s >= end)
-		{
-			lex->token_terminator = s;
-			return JSON_INVALID_TOKEN;
-		}
+			FAIL_AT_CHAR_START(JSON_INVALID_TOKEN);
 		else if (*s == '"')
 			break;
 		else if (*s == '\\')
@@ -726,10 +744,7 @@ json_lex_string(JsonLexContext *lex)
 			/* OK, we have an escape character. */
 			s++;
 			if (s >= end)
-			{
-				lex->token_terminator = s;
-				return JSON_INVALID_TOKEN;
-			}
+				FAIL_AT_CHAR_START(JSON_INVALID_TOKEN);
 			else if (*s == 'u')
 			{
 				int			i;
@@ -739,10 +754,7 @@ json_lex_string(JsonLexContext *lex)
 				{
 					s++;
 					if (s >= end)
-					{
-						lex->token_terminator = s;
-						return JSON_INVALID_TOKEN;
-					}
+						FAIL_AT_CHAR_START(JSON_INVALID_TOKEN);
 					else if (*s >= '0' && *s <= '9')
 						ch = (ch * 16) + (*s - '0');
 					else if (*s >= 'a' && *s <= 'f')
@@ -750,10 +762,7 @@ json_lex_string(JsonLexContext *lex)
 					else if (*s >= 'A' && *s <= 'F')
 						ch = (ch * 16) + (*s - 'A') + 10;
 					else
-					{
-						lex->token_terminator = s + pg_encoding_mblen_bounded(lex->input_encoding, s);
-						return JSON_UNICODE_ESCAPE_FORMAT;
-					}
+						FAIL_AT_CHAR_END(JSON_UNICODE_ESCAPE_FORMAT);
 				}
 				if (lex->strval != NULL)
 				{
@@ -763,20 +772,20 @@ json_lex_string(JsonLexContext *lex)
 					if (is_utf16_surrogate_first(ch))
 					{
 						if (hi_surrogate != -1)
-							return JSON_UNICODE_HIGH_SURROGATE;
+							FAIL_AT_CHAR_END(JSON_UNICODE_HIGH_SURROGATE);
 						hi_surrogate = ch;
 						continue;
 					}
 					else if (is_utf16_surrogate_second(ch))
 					{
 						if (hi_surrogate == -1)
-							return JSON_UNICODE_LOW_SURROGATE;
+							FAIL_AT_CHAR_END(JSON_UNICODE_LOW_SURROGATE);
 						ch = surrogate_pair_to_codepoint(hi_surrogate, ch);
 						hi_surrogate = -1;
 					}
 
 					if (hi_surrogate != -1)
-						return JSON_UNICODE_LOW_SURROGATE;
+						FAIL_AT_CHAR_END(JSON_UNICODE_LOW_SURROGATE);
 
 					/*
 					 * Reject invalid cases.  We can't have a value above
@@ -786,7 +795,7 @@ json_lex_string(JsonLexContext *lex)
 					if (ch == 0)
 					{
 						/* We can't allow this, since our TEXT type doesn't */
-						return JSON_UNICODE_CODE_POINT_ZERO;
+						FAIL_AT_CHAR_END(JSON_UNICODE_CODE_POINT_ZERO);
 					}
 
 					/*
@@ -800,7 +809,7 @@ json_lex_string(JsonLexContext *lex)
 						char		cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
 
 						if (!pg_unicode_to_server_noerror(ch, (unsigned char *) cbuf))
-							return JSON_UNICODE_UNTRANSLATABLE;
+							FAIL_AT_CHAR_END(JSON_UNICODE_UNTRANSLATABLE);
 						appendStringInfoString(lex->strval, cbuf);
 					}
 #else
@@ -820,14 +829,14 @@ json_lex_string(JsonLexContext *lex)
 						appendStringInfoChar(lex->strval, (char) ch);
 					}
 					else
-						return JSON_UNICODE_HIGH_ESCAPE;
+						FAIL_AT_CHAR_END(JSON_UNICODE_HIGH_ESCAPE);
 #endif							/* FRONTEND */
 				}
 			}
 			else if (lex->strval != NULL)
 			{
 				if (hi_surrogate != -1)
-					return JSON_UNICODE_LOW_SURROGATE;
+					FAIL_AT_CHAR_END(JSON_UNICODE_LOW_SURROGATE);
 
 				switch (*s)
 				{
@@ -852,10 +861,14 @@ json_lex_string(JsonLexContext *lex)
 						appendStringInfoChar(lex->strval, '\t');
 						break;
 					default:
-						/* Not a valid string escape, so signal error. */
+
+						/*
+						 * Not a valid string escape, so signal error.  We
+						 * adjust token_start so that just the escape sequence
+						 * is reported, not the whole string.
+						 */
 						lex->token_start = s;
-						lex->token_terminator = s + pg_encoding_mblen_bounded(lex->input_encoding, s);
-						return JSON_ESCAPING_INVALID;
+						FAIL_AT_CHAR_END(JSON_ESCAPING_INVALID);
 				}
 			}
 			else if (strchr("\"\\/bfnrt", *s) == NULL)
@@ -868,8 +881,7 @@ json_lex_string(JsonLexContext *lex)
 				 * shown it's not a performance win.
 				 */
 				lex->token_start = s;
-				lex->token_terminator = s + pg_encoding_mblen_bounded(lex->input_encoding, s);
-				return JSON_ESCAPING_INVALID;
+				FAIL_AT_CHAR_END(JSON_ESCAPING_INVALID);
 			}
 		}
 		else
@@ -877,7 +889,7 @@ json_lex_string(JsonLexContext *lex)
 			char	   *p = s;
 
 			if (hi_surrogate != -1)
-				return JSON_UNICODE_LOW_SURROGATE;
+				FAIL_AT_CHAR_END(JSON_UNICODE_LOW_SURROGATE);
 
 			/*
 			 * Skip to the first byte that requires special handling, so we
@@ -917,12 +929,18 @@ json_lex_string(JsonLexContext *lex)
 	}
 
 	if (hi_surrogate != -1)
+	{
+		lex->token_terminator = s + 1;
 		return JSON_UNICODE_LOW_SURROGATE;
+	}
 
 	/* Hooray, we found the end of the string! */
 	lex->prev_token_terminator = lex->token_terminator;
 	lex->token_terminator = s + 1;
 	return JSON_SUCCESS;
+
+#undef FAIL_AT_CHAR_START
+#undef FAIL_AT_CHAR_END
 }
 
 /*
diff --git a/src/test/regress/expected/json_encoding.out b/src/test/regress/expected/json_encoding.out
index f18ba9ebb2..fe729db8c9 100644
--- a/src/test/regress/expected/json_encoding.out
+++ b/src/test/regress/expected/json_encoding.out
@@ -56,19 +56,19 @@ select json '{ "a":  "\ud83d\ude04\ud83d\udc36" }' -> 'a' as correct_in_utf8;
 select json '{ "a":  "\ud83d\ud83d" }' -> 'a'; -- 2 high surrogates in a row
 ERROR:  invalid input syntax for type json
 DETAIL:  Unicode high surrogate must not follow a high surrogate.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ud83d\ud83d...
 select json '{ "a":  "\ude04\ud83d" }' -> 'a'; -- surrogates in wrong order
 ERROR:  invalid input syntax for type json
 DETAIL:  Unicode low surrogate must follow a high surrogate.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ude04...
 select json '{ "a":  "\ud83dX" }' -> 'a'; -- orphan high surrogate
 ERROR:  invalid input syntax for type json
 DETAIL:  Unicode low surrogate must follow a high surrogate.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ud83dX...
 select json '{ "a":  "\ude04X" }' -> 'a'; -- orphan low surrogate
 ERROR:  invalid input syntax for type json
 DETAIL:  Unicode low surrogate must follow a high surrogate.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ude04...
 --handling of simple unicode escapes
 select json '{ "a":  "the Copyright \u00a9 sign" }' as correct_in_utf8;
             correct_in_utf8            
@@ -121,7 +121,7 @@ select json '{ "a":  "dollar \\u0024 character" }' ->> 'a' as not_an_escape;
 select json '{ "a":  "null \u0000 escape" }' ->> 'a' as fails;
 ERROR:  unsupported Unicode escape sequence
 DETAIL:  \u0000 cannot be converted to text.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "null \u0000...
 select json '{ "a":  "null \\u0000 escape" }' ->> 'a' as not_an_escape;
    not_an_escape    
 --------------------
@@ -159,7 +159,7 @@ ERROR:  unsupported Unicode escape sequence
 LINE 1: SELECT '"\u0000"'::jsonb;
                ^
 DETAIL:  \u0000 cannot be converted to text.
-CONTEXT:  JSON data, line 1: ...
+CONTEXT:  JSON data, line 1: "\u0000...
 -- use octet_length here so we don't get an odd unicode char in the
 -- output
 SELECT octet_length('"\uaBcD"'::jsonb::text); -- OK, uppercase and lower case both OK
@@ -180,25 +180,25 @@ ERROR:  invalid input syntax for type json
 LINE 1: SELECT jsonb '{ "a":  "\ud83d\ud83d" }' -> 'a';
                      ^
 DETAIL:  Unicode high surrogate must not follow a high surrogate.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ud83d\ud83d...
 SELECT jsonb '{ "a":  "\ude04\ud83d" }' -> 'a'; -- surrogates in wrong order
 ERROR:  invalid input syntax for type json
 LINE 1: SELECT jsonb '{ "a":  "\ude04\ud83d" }' -> 'a';
                      ^
 DETAIL:  Unicode low surrogate must follow a high surrogate.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ude04...
 SELECT jsonb '{ "a":  "\ud83dX" }' -> 'a'; -- orphan high surrogate
 ERROR:  invalid input syntax for type json
 LINE 1: SELECT jsonb '{ "a":  "\ud83dX" }' -> 'a';
                      ^
 DETAIL:  Unicode low surrogate must follow a high surrogate.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ud83dX...
 SELECT jsonb '{ "a":  "\ude04X" }' -> 'a'; -- orphan low surrogate
 ERROR:  invalid input syntax for type json
 LINE 1: SELECT jsonb '{ "a":  "\ude04X" }' -> 'a';
                      ^
 DETAIL:  Unicode low surrogate must follow a high surrogate.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ude04...
 -- handling of simple unicode escapes
 SELECT jsonb '{ "a":  "the Copyright \u00a9 sign" }' as correct_in_utf8;
         correct_in_utf8        
@@ -223,7 +223,7 @@ ERROR:  unsupported Unicode escape sequence
 LINE 1: SELECT jsonb '{ "a":  "null \u0000 escape" }' as fails;
                      ^
 DETAIL:  \u0000 cannot be converted to text.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "null \u0000...
 SELECT jsonb '{ "a":  "null \\u0000 escape" }' as not_an_escape;
         not_an_escape         
 ------------------------------
@@ -253,7 +253,7 @@ ERROR:  unsupported Unicode escape sequence
 LINE 1: SELECT jsonb '{ "a":  "null \u0000 escape" }' ->> 'a' as fai...
                      ^
 DETAIL:  \u0000 cannot be converted to text.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "null \u0000...
 SELECT jsonb '{ "a":  "null \\u0000 escape" }' ->> 'a' as not_an_escape;
    not_an_escape    
 --------------------
diff --git a/src/test/regress/expected/json_encoding_1.out b/src/test/regress/expected/json_encoding_1.out
index 77bdaf63a1..5c8d91ad0b 100644
--- a/src/test/regress/expected/json_encoding_1.out
+++ b/src/test/regress/expected/json_encoding_1.out
@@ -50,23 +50,23 @@ SELECT '"\uaBcD"'::json;		-- OK, uppercase and lower case both OK
 select json '{ "a":  "\ud83d\ude04\ud83d\udc36" }' -> 'a' as correct_in_utf8;
 ERROR:  unsupported Unicode escape sequence
 DETAIL:  Unicode escape value could not be translated to the server's encoding SQL_ASCII.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ud83d\ude04...
 select json '{ "a":  "\ud83d\ud83d" }' -> 'a'; -- 2 high surrogates in a row
 ERROR:  invalid input syntax for type json
 DETAIL:  Unicode high surrogate must not follow a high surrogate.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ud83d\ud83d...
 select json '{ "a":  "\ude04\ud83d" }' -> 'a'; -- surrogates in wrong order
 ERROR:  invalid input syntax for type json
 DETAIL:  Unicode low surrogate must follow a high surrogate.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ude04...
 select json '{ "a":  "\ud83dX" }' -> 'a'; -- orphan high surrogate
 ERROR:  invalid input syntax for type json
 DETAIL:  Unicode low surrogate must follow a high surrogate.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ud83dX...
 select json '{ "a":  "\ude04X" }' -> 'a'; -- orphan low surrogate
 ERROR:  invalid input syntax for type json
 DETAIL:  Unicode low surrogate must follow a high surrogate.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ude04...
 --handling of simple unicode escapes
 select json '{ "a":  "the Copyright \u00a9 sign" }' as correct_in_utf8;
             correct_in_utf8            
@@ -101,7 +101,7 @@ select json '{ "a":  "null \\u0000 escape" }' as not_an_escape;
 select json '{ "a":  "the Copyright \u00a9 sign" }' ->> 'a' as correct_in_utf8;
 ERROR:  unsupported Unicode escape sequence
 DETAIL:  Unicode escape value could not be translated to the server's encoding SQL_ASCII.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "the Copyright \u00a9...
 select json '{ "a":  "dollar \u0024 character" }' ->> 'a' as correct_everywhere;
  correct_everywhere 
 --------------------
@@ -117,7 +117,7 @@ select json '{ "a":  "dollar \\u0024 character" }' ->> 'a' as not_an_escape;
 select json '{ "a":  "null \u0000 escape" }' ->> 'a' as fails;
 ERROR:  unsupported Unicode escape sequence
 DETAIL:  \u0000 cannot be converted to text.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "null \u0000...
 select json '{ "a":  "null \\u0000 escape" }' ->> 'a' as not_an_escape;
    not_an_escape    
 --------------------
@@ -155,7 +155,7 @@ ERROR:  unsupported Unicode escape sequence
 LINE 1: SELECT '"\u0000"'::jsonb;
                ^
 DETAIL:  \u0000 cannot be converted to text.
-CONTEXT:  JSON data, line 1: ...
+CONTEXT:  JSON data, line 1: "\u0000...
 -- use octet_length here so we don't get an odd unicode char in the
 -- output
 SELECT octet_length('"\uaBcD"'::jsonb::text); -- OK, uppercase and lower case both OK
@@ -163,45 +163,45 @@ ERROR:  unsupported Unicode escape sequence
 LINE 1: SELECT octet_length('"\uaBcD"'::jsonb::text);
                             ^
 DETAIL:  Unicode escape value could not be translated to the server's encoding SQL_ASCII.
-CONTEXT:  JSON data, line 1: ...
+CONTEXT:  JSON data, line 1: "\uaBcD...
 -- handling of unicode surrogate pairs
 SELECT octet_length((jsonb '{ "a":  "\ud83d\ude04\ud83d\udc36" }' -> 'a')::text) AS correct_in_utf8;
 ERROR:  unsupported Unicode escape sequence
 LINE 1: SELECT octet_length((jsonb '{ "a":  "\ud83d\ude04\ud83d\udc3...
                                    ^
 DETAIL:  Unicode escape value could not be translated to the server's encoding SQL_ASCII.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ud83d\ude04...
 SELECT jsonb '{ "a":  "\ud83d\ud83d" }' -> 'a'; -- 2 high surrogates in a row
 ERROR:  invalid input syntax for type json
 LINE 1: SELECT jsonb '{ "a":  "\ud83d\ud83d" }' -> 'a';
                      ^
 DETAIL:  Unicode high surrogate must not follow a high surrogate.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ud83d\ud83d...
 SELECT jsonb '{ "a":  "\ude04\ud83d" }' -> 'a'; -- surrogates in wrong order
 ERROR:  invalid input syntax for type json
 LINE 1: SELECT jsonb '{ "a":  "\ude04\ud83d" }' -> 'a';
                      ^
 DETAIL:  Unicode low surrogate must follow a high surrogate.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ude04...
 SELECT jsonb '{ "a":  "\ud83dX" }' -> 'a'; -- orphan high surrogate
 ERROR:  invalid input syntax for type json
 LINE 1: SELECT jsonb '{ "a":  "\ud83dX" }' -> 'a';
                      ^
 DETAIL:  Unicode low surrogate must follow a high surrogate.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ud83dX...
 SELECT jsonb '{ "a":  "\ude04X" }' -> 'a'; -- orphan low surrogate
 ERROR:  invalid input syntax for type json
 LINE 1: SELECT jsonb '{ "a":  "\ude04X" }' -> 'a';
                      ^
 DETAIL:  Unicode low surrogate must follow a high surrogate.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ude04...
 -- handling of simple unicode escapes
 SELECT jsonb '{ "a":  "the Copyright \u00a9 sign" }' as correct_in_utf8;
 ERROR:  unsupported Unicode escape sequence
 LINE 1: SELECT jsonb '{ "a":  "the Copyright \u00a9 sign" }' as corr...
                      ^
 DETAIL:  Unicode escape value could not be translated to the server's encoding SQL_ASCII.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "the Copyright \u00a9...
 SELECT jsonb '{ "a":  "dollar \u0024 character" }' as correct_everywhere;
      correct_everywhere      
 -----------------------------
@@ -219,7 +219,7 @@ ERROR:  unsupported Unicode escape sequence
 LINE 1: SELECT jsonb '{ "a":  "null \u0000 escape" }' as fails;
                      ^
 DETAIL:  \u0000 cannot be converted to text.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "null \u0000...
 SELECT jsonb '{ "a":  "null \\u0000 escape" }' as not_an_escape;
         not_an_escape         
 ------------------------------
@@ -231,7 +231,7 @@ ERROR:  unsupported Unicode escape sequence
 LINE 1: SELECT jsonb '{ "a":  "the Copyright \u00a9 sign" }' ->> 'a'...
                      ^
 DETAIL:  Unicode escape value could not be translated to the server's encoding SQL_ASCII.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "the Copyright \u00a9...
 SELECT jsonb '{ "a":  "dollar \u0024 character" }' ->> 'a' as correct_everywhere;
  correct_everywhere 
 --------------------
@@ -249,7 +249,7 @@ ERROR:  unsupported Unicode escape sequence
 LINE 1: SELECT jsonb '{ "a":  "null \u0000 escape" }' ->> 'a' as fai...
                      ^
 DETAIL:  \u0000 cannot be converted to text.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "null \u0000...
 SELECT jsonb '{ "a":  "null \\u0000 escape" }' ->> 'a' as not_an_escape;
    not_an_escape    
 --------------------

Re: Bug in jsonb_in function (14 & 15 version are affected)

Reply via email to