This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch branch-4.1
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-4.1 by this push:
new e86618ac5236 [SPARK-55991] Fix unicode related SQL text corruption
with parameters
e86618ac5236 is described below
commit e86618ac5236491dff6b138335ebd672202ec9b2
Author: Serge Rielau <[email protected]>
AuthorDate: Sun Mar 15 04:33:25 2026 +0800
[SPARK-55991] Fix unicode related SQL text corruption with parameters
### What changes were proposed in this pull request?
Fix the parameter substitution code to account for Unicode supplementary
characters.
### Why are the changes needed?
Emojis (and other supplementary characters) corrupt the SQL text when
parameter markers are substituted, because ANTLR reports token offsets in code
points while String indices are in UTF-16 code units.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
Added new test cases to ParameterSubstitutionSuite.
### Was this patch authored or co-authored using generative AI tooling?
Yes, Claude Opus 4.6 high
Closes #54798 from srielau/emoji.
Authored-by: Serge Rielau <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
(cherry picked from commit 4d7976870e4b53950e96f5cd26938c3093d6e421)
Signed-off-by: Wenchen Fan <[email protected]>
---
.../catalyst/parser/SubstituteParamsParser.scala | 19 ++++++-----
.../parser/ParameterSubstitutionSuite.scala | 37 ++++++++++++++++++++++
2 files changed, 48 insertions(+), 8 deletions(-)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/SubstituteParamsParser.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/SubstituteParamsParser.scala
index 9beead0e6487..11f6180fe35d 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/SubstituteParamsParser.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/SubstituteParamsParser.scala
@@ -188,22 +188,25 @@ class SubstituteParamsParser extends Logging {
* Apply a list of substitutions to the SQL text.
* Inserts a space separator when a parameter is immediately preceded by a
quote
* to avoid back-to-back quotes after substitution.
+ *
+ * ANTLR's CodePointCharStream reports token positions in Unicode code
points, but
+ * Java/Scala String indices are in UTF-16 code units. Supplementary
characters
+ * (e.g. emojis) occupy 1 code point but 2 code units, so we must convert.
*/
private def applySubstitutions(sqlText: String, substitutions:
List[Substitution]): String = {
- // Sort substitutions by start position in reverse order to avoid offset
issues
val sortedSubstitutions = substitutions.sortBy(-_.start)
var result = sqlText
sortedSubstitutions.foreach { substitution =>
- val prefix = result.substring(0, substitution.start)
+ val startCU = result.offsetByCodePoints(0, substitution.start)
+ val endCU = result.offsetByCodePoints(0, substitution.end)
+ val prefix = result.substring(0, startCU)
val replacement = substitution.replacement
- val suffix = result.substring(substitution.end)
+ val suffix = result.substring(endCU)
- // Check if replacement is immediately preceded by a quote and doesn't
already
- // start with whitespace
- val needsSpace = substitution.start > 0 &&
- (result(substitution.start - 1) == '\'' || result(substitution.start -
1) == '"') &&
- replacement.nonEmpty && !replacement(0).isWhitespace
+ val needsSpace = startCU > 0 &&
+ (result.charAt(startCU - 1) == '\'' || result.charAt(startCU - 1) ==
'"') &&
+ replacement.nonEmpty && !replacement.charAt(0).isWhitespace
val space = if (needsSpace) " " else ""
result = s"$prefix$space$replacement$suffix"
diff --git
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ParameterSubstitutionSuite.scala
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ParameterSubstitutionSuite.scala
index 83340ed1edc2..8baf3a921eaf 100644
---
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ParameterSubstitutionSuite.scala
+++
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ParameterSubstitutionSuite.scala
@@ -121,6 +121,43 @@ class ParameterSubstitutionSuite extends SparkFunSuite {
}
}
+ test("ParameterHandler - named parameter with emoji in SQL") {
+ val emoji = new String(Character.toChars(0x1F4AA)) // supplementary char
(2 UTF-16 code units)
+ val context = NamedParameterContext(Map("team" -> Literal("abc")))
+ val sql = s"SELECT '${emoji}' AS a FROM T WHERE :team IS NULL"
+ val (result, _) = ParameterHandler.substituteParameters(sql, context)
+ assert(result === s"SELECT '${emoji}' AS a FROM T WHERE 'abc' IS NULL")
+ }
+
+ test("ParameterHandler - positional parameter with emoji in SQL") {
+ val emoji = new String(Character.toChars(0x1F4AA))
+ val context = PositionalParameterContext(Seq(Literal("abc")))
+ val sql = s"SELECT '${emoji}' AS a FROM T WHERE ? IS NULL"
+ val (result, _) = ParameterHandler.substituteParameters(sql, context)
+ assert(result === s"SELECT '${emoji}' AS a FROM T WHERE 'abc' IS NULL")
+ }
+
+ test("ParameterHandler - multiple params with emoji in SQL and replacement
values") {
+ val flexed = new String(Character.toChars(0x1F4AA))
+ val tada = new String(Character.toChars(0x1F389))
+ val context = NamedParameterContext(Map(
+ "p1" -> Literal(tada),
+ "p2" -> Literal(42)
+ ))
+ val sql = s"SELECT '${flexed}', :p1, '${flexed}', :p2"
+ val (result, _) = ParameterHandler.substituteParameters(sql, context)
+ assert(result === s"SELECT '${flexed}', '${tada}', '${flexed}', 42")
+ }
+
+ test("ParameterHandler - positional params with multiple emojis") {
+ val flexed = new String(Character.toChars(0x1F4AA))
+ val tada = new String(Character.toChars(0x1F389))
+ val context = PositionalParameterContext(Seq(Literal(tada), Literal(99)))
+ val sql = s"SELECT '${flexed}', ?, '${flexed}${flexed}', ?"
+ val (result, _) = ParameterHandler.substituteParameters(sql, context)
+ assert(result === s"SELECT '${flexed}', '${tada}', '${flexed}${flexed}',
99")
+ }
+
test("Large parameter set") {
val largeParamMap = (1 to 100).map(i => s"param$i" -> Literal(i)).toMap
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]