This is an automated email from the ASF dual-hosted git repository.
morrySnow pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 22c9ed691d5 [fix](fe) Reject lone UTF-16 surrogates in JSONB literals
(RFC 8259 §8.2) (#63255)
22c9ed691d5 is described below
commit 22c9ed691d5c0708d20d6daf357bbd1c1ea97168
Author: morrySnow <[email protected]>
AuthorDate: Mon May 18 15:00:26 2026 +0800
[fix](fe) Reject lone UTF-16 surrogates in JSONB literals (RFC 8259 §8.2)
(#63255)
## Summary
**Problem fixed:** `JsonLiteral` (Nereids/Jackson path) and
`analysis.JsonLiteral` (legacy/Gson path) silently accepted lone UTF-16
surrogates (e.g. `'"\uD800"'::JSONB`) as valid JSONB literals. RFC 8259
§8.2 points out that the JSON grammar permits such unpaired surrogates
but that they cannot encode Unicode characters, and warns that software
receiving them behaves unpredictably; they also cannot be represented
as valid UTF-8.
**How it was fixed:** Added a recursive `validateNoLoneSurrogate`
post-parse check in both `JsonLiteral` constructors. After Jackson/Gson
parses the JSON tree, the method walks all string values and object
field names and immediately throws `AnalysisException` on any lone high
or low surrogate.
## What problem does this PR solve?
**Before this fix:** Passing a lone surrogate like `'"\uD800"'::JSONB`
was silently accepted at the FE layer. The invalid value would be stored
in the BE JSONB column. The error would only surface later — during
`EXPORT`, `SELECT INTO OUTFILE`, or cross-system transfer — making it
hard to diagnose. This is a data-correctness (SEV-2) issue.
**After this fix:** Constructing a `JsonLiteral` with a lone surrogate
immediately throws `AnalysisException: Invalid jsonb literal: JSON
string contains lone high surrogate` (or `lone low surrogate`), giving
the user a clear error at write time.
## Behavior change
| Scenario | Before | After |
|---|---|---|
| `'"\uD800"'::JSONB` | Accepted silently | AnalysisException at parse time |
| `INSERT INTO t VALUES (1, '"\uD800"')` | Stored in BE, may fail on export | AnalysisException at FE |
| `'"\uD83D\uDE00"'::JSONB` (valid pair 😀) | Accepted | Still accepted (no change) |
| `'"hello"'::JSONB` (plain ASCII) | Accepted | Still accepted (no change) |
## Why both paths?
Doris has two `JsonLiteral` implementations:
- **Nereids** (`fe-core`): uses Jackson `ObjectMapper.readTree` —
Jackson accepts lone surrogates by default
- **Legacy** (`fe-catalog`, `analysis`): uses Gson `JsonParser.parse` —
Gson also accepts lone surrogates by default
Both needed the same fix to ensure consistent behavior regardless of
which query path is used.
## Release note
JSONB literal expressions now reject strings containing lone UTF-16
surrogates (e.g. `'"\uD800"'::JSONB`) with an `AnalysisException` at
parse time, conforming to RFC 8259 §8.2. Previously such literals were
silently accepted, which could cause errors during export or
cross-system data transfer.
---------
Co-authored-by: Copilot <[email protected]>
---
.../org/apache/doris/analysis/JsonLiteral.java | 39 ++++++-
.../trees/expressions/literal/JsonLiteral.java | 39 ++++++-
.../trees/expressions/literal/JsonLiteralTest.java | 124 +++++++++++++++++++++
3 files changed, 200 insertions(+), 2 deletions(-)
diff --git
a/fe/fe-catalog/src/main/java/org/apache/doris/analysis/JsonLiteral.java
b/fe/fe-catalog/src/main/java/org/apache/doris/analysis/JsonLiteral.java
index 26af45d4bcf..f4ad4ab419b 100644
--- a/fe/fe-catalog/src/main/java/org/apache/doris/analysis/JsonLiteral.java
+++ b/fe/fe-catalog/src/main/java/org/apache/doris/analysis/JsonLiteral.java
@@ -20,10 +20,12 @@ package org.apache.doris.analysis;
import org.apache.doris.catalog.Type;
import org.apache.doris.common.AnalysisException;
+import com.google.gson.JsonElement;
import com.google.gson.JsonParser;
import com.google.gson.JsonSyntaxException;
import com.google.gson.annotations.SerializedName;
+import java.util.Map;
import java.util.Objects;
public class JsonLiteral extends LiteralExpr {
@@ -41,7 +43,8 @@ public class JsonLiteral extends LiteralExpr {
public JsonLiteral(String value) throws AnalysisException {
try {
- parser.parse(value);
+ JsonElement element = parser.parse(value);
+ validateNoLoneSurrogate(element);
} catch (JsonSyntaxException e) {
throw new AnalysisException("Invalid jsonb literal: " +
e.getMessage());
}
@@ -50,6 +53,40 @@ public class JsonLiteral extends LiteralExpr {
this.nullable = false;
}
+ // RFC 8259 §8.2: JSON strings must not contain lone UTF-16 surrogates.
+ // Gson accepts them by default, so we validate after parsing by walking the parsed element tree recursively.
+ // Both string values AND object field names are checked; the first offending string raises AnalysisException.
+ private static void validateNoLoneSurrogate(JsonElement element) throws
AnalysisException {
+ if (element.isJsonPrimitive() &&
element.getAsJsonPrimitive().isString()) {
+ validateNoLoneSurrogateInString(element.getAsString());  // string primitive: check its text
+ } else if (element.isJsonObject()) {
+ for (Map.Entry<String, JsonElement> entry :
element.getAsJsonObject().entrySet()) {
+ validateNoLoneSurrogateInString(entry.getKey());  // field names are strings too
+ validateNoLoneSurrogate(entry.getValue());  // recurse into the field's value
+ }
+ } else if (element.isJsonArray()) {
+ for (JsonElement child : element.getAsJsonArray()) {
+ validateNoLoneSurrogate(child);  // recurse into each array element
+ }
+ }  // any other element kind (e.g. non-string primitives) falls through without a check
+ }
+
+ private static void validateNoLoneSurrogateInString(String s) throws
AnalysisException {
+ for (int i = 0; i < s.length(); i++) {  // scan the UTF-16 code units of s left to right; an empty string passes
+ char c = s.charAt(i);
+ if (Character.isHighSurrogate(c)) {  // a high surrogate is only valid when a low surrogate follows immediately
+ if (i + 1 >= s.length() ||
!Character.isLowSurrogate(s.charAt(i + 1))) {
+ throw new AnalysisException(
+ "Invalid jsonb literal: JSON string contains lone
high surrogate");
+ }
+ i++; // skip the paired low surrogate
+ } else if (Character.isLowSurrogate(c)) {  // paired low surrogates were skipped above, so this one is unpaired
+ throw new AnalysisException(
+ "Invalid jsonb literal: JSON string contains lone low
surrogate");
+ }
+ }
+ }
+
protected JsonLiteral(JsonLiteral other) {
super(other);
value = other.value;
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/JsonLiteral.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/JsonLiteral.java
index 4c4c7dced4c..b563b430893 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/JsonLiteral.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/JsonLiteral.java
@@ -27,6 +27,9 @@ import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
+import java.util.Iterator;
+import java.util.Map;
+
/**
* literal for json type.
*/
@@ -49,8 +52,42 @@ public class JsonLiteral extends Literal {
}
if (jsonNode == null || jsonNode.isMissingNode()) {
throw new AnalysisException("Invalid jsonb literal: ''");
+ }
+ validateNoLoneSurrogate(jsonNode);
+ this.value = jsonNode.toString();
+ }
+
+ // RFC 8259 §8.2: JSON strings must not contain lone UTF-16 surrogates.
+ // Jackson accepts them by default, so we validate after parsing by walking the parsed node tree recursively.
+ // Both string values AND object field names are checked; the first offending string raises AnalysisException.
+ private static void validateNoLoneSurrogate(JsonNode node) {
+ if (node.isTextual()) {
+ validateNoLoneSurrogateInString(node.textValue());  // text node: check its string value
+ } else if (node.isObject()) {
+ Iterator<Map.Entry<String, JsonNode>> fields = node.fields();
+ while (fields.hasNext()) {
+ Map.Entry<String, JsonNode> entry = fields.next();
+ validateNoLoneSurrogateInString(entry.getKey());  // field names are strings too
+ validateNoLoneSurrogate(entry.getValue());  // recurse into the field's value
+ }
} else {
- this.value = jsonNode.toString();
+ node.forEach(JsonLiteral::validateNoLoneSurrogate);  // every other node kind: recurse into whatever children iteration yields (presumably none for scalar nodes — confirm against Jackson docs)
+ }
+ }
+
+ private static void validateNoLoneSurrogateInString(String s) {  // throws AnalysisException on the first surrogate in s that is not part of a high+low pair
+ for (int i = 0; i < s.length(); i++) {  // scan the UTF-16 code units of s left to right; an empty string passes
+ char c = s.charAt(i);
+ if (Character.isHighSurrogate(c)) {  // a high surrogate is only valid when a low surrogate follows immediately
+ if (i + 1 >= s.length() ||
!Character.isLowSurrogate(s.charAt(i + 1))) {
+ throw new AnalysisException(
+ "Invalid jsonb literal: JSON string contains lone
high surrogate");
+ }
+ i++; // skip the paired low surrogate
+ } else if (Character.isLowSurrogate(c)) {  // paired low surrogates were skipped above, so this one is unpaired
+ throw new AnalysisException(
+ "Invalid jsonb literal: JSON string contains lone low
surrogate");
+ }
}
}
diff --git
a/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/expressions/literal/JsonLiteralTest.java
b/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/expressions/literal/JsonLiteralTest.java
new file mode 100644
index 00000000000..6e16ea9805e
--- /dev/null
+++
b/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/expressions/literal/JsonLiteralTest.java
@@ -0,0 +1,124 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.nereids.trees.expressions.literal;
+
+import org.apache.doris.nereids.exceptions.AnalysisException;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Tests for JsonLiteral surrogate validation (RFC 8259 §8.2).
+ *
+ * Each case constructs a JsonLiteral from a JSON text whose strings use backslash-u escapes
+ * and asserts one of two outcomes: construction succeeds (plain strings, objects, arrays and
+ * valid surrogate pairs), or it throws AnalysisException with a message naming the lone high
+ * or lone low surrogate. Lone high surrogates are exercised at top level, as an object value,
+ * as an array element and as an object key; lone low surrogates at top level and as an
+ * object key.
+ */
+public class JsonLiteralTest {
+
+ // --- valid inputs ---
+
+ @Test
+ public void testValidAsciiString() {
+ // plain ASCII string in JSON is always valid
+ Assertions.assertDoesNotThrow(() -> new JsonLiteral("\"hello\""));
+ }
+
+ @Test
+ public void testValidObject() {
+ Assertions.assertDoesNotThrow(() -> new
JsonLiteral("{\"key\":\"value\"}"));
+ }
+
+ @Test
+ public void testValidArray() {
+ Assertions.assertDoesNotThrow(() -> new JsonLiteral("[1, \"abc\",
true]"));
+ }
+
+ @Test
+ public void testValidSurrogatePair() {
+ // \uD83D\uDE00 is a valid surrogate pair (U+1F600, 😀)
+ // JSON escape: "\uD83D\uDE00"
+ Assertions.assertDoesNotThrow(() -> new
JsonLiteral("\"\\uD83D\\uDE00\""));
+ }
+
+ @Test
+ public void testValidSurrogatePairInObject() {
+ Assertions.assertDoesNotThrow(() -> new
JsonLiteral("{\"emoji\":\"\\uD83D\\uDE00\"}"));
+ }
+
+ // --- lone high surrogate ---
+
+ @Test
+ public void testLoneHighSurrogateTopLevel() {
+ // "\uD800" — lone high surrogate, no paired low surrogate
+ AnalysisException ex = Assertions.assertThrows(AnalysisException.class,
+ () -> new JsonLiteral("\"\\uD800\""));
+ Assertions.assertTrue(ex.getMessage().contains("lone high surrogate"),
+ "Expected 'lone high surrogate' in: " + ex.getMessage());
+ }
+
+ @Test
+ public void testLoneHighSurrogateInObject() {
+ AnalysisException ex = Assertions.assertThrows(AnalysisException.class,
+ () -> new JsonLiteral("{\"k\":\"\\uD800\"}"));
+ Assertions.assertTrue(ex.getMessage().contains("lone high surrogate"));
+ }
+
+ @Test
+ public void testLoneHighSurrogateInArray() {
+ AnalysisException ex = Assertions.assertThrows(AnalysisException.class,
+ () -> new JsonLiteral("[\"\\uD800\"]"));
+ Assertions.assertTrue(ex.getMessage().contains("lone high surrogate"));
+ }
+
+ @Test
+ public void testHighSurrogateFollowedByNonLow() {
+ // \uD800\u0041 — high surrogate followed by 'A', not a low surrogate
+ AnalysisException ex = Assertions.assertThrows(AnalysisException.class,
+ () -> new JsonLiteral("\"\\uD800A\""));
+ Assertions.assertTrue(ex.getMessage().contains("lone high surrogate"));
+ }
+
+ // --- lone low surrogate, followed by the object-key cases (high, low and valid pair) ---
+
+ @Test
+ public void testLoneLowSurrogateTopLevel() {
+ // "\uDC00" — lone low surrogate
+ AnalysisException ex = Assertions.assertThrows(AnalysisException.class,
+ () -> new JsonLiteral("\"\\uDC00\""));
+ Assertions.assertTrue(ex.getMessage().contains("lone low surrogate"),
+ "Expected 'lone low surrogate' in: " + ex.getMessage());
+ }
+
+ @Test
+ public void testLoneHighSurrogateInObjectKey() {
+ // lone surrogate in object field name must also be rejected
+ AnalysisException ex = Assertions.assertThrows(AnalysisException.class,
+ () -> new JsonLiteral("{\"\\uD800\":\"value\"}"));
+ Assertions.assertTrue(ex.getMessage().contains("lone high surrogate"));
+ }
+
+ @Test
+ public void testLoneLowSurrogateInObjectKey() {
+ AnalysisException ex = Assertions.assertThrows(AnalysisException.class,
+ () -> new JsonLiteral("{\"\\uDC00\":\"value\"}"));
+ Assertions.assertTrue(ex.getMessage().contains("lone low surrogate"));
+ }
+
+ @Test
+ public void testValidSurrogatePairInObjectKey() {
+ // valid surrogate pair in key must be accepted
+ Assertions.assertDoesNotThrow(() -> new
JsonLiteral("{\"\\uD83D\\uDE00\":\"ok\"}"));
+ }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]