This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch branch-4.1
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-4.1 by this push:
     new e86618ac5236 [SPARK-55991] Fix unicode related SQL text corruption 
with parameters
e86618ac5236 is described below

commit e86618ac5236491dff6b138335ebd672202ec9b2
Author: Serge Rielau <[email protected]>
AuthorDate: Sun Mar 15 04:33:25 2026 +0800

    [SPARK-55991] Fix unicode related SQL text corruption with parameters
    
    ### What changes were proposed in this pull request?
    
    Fix the parameter substitution code to correctly handle Unicode supplementary 
characters
    
    ### Why are the changes needed?
    
    Emojis (and other supplementary Unicode characters) corrupt the SQL text when 
parameter markers are substituted, because token offsets are reported in code points while String indices are in UTF-16 code units.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No
    
    ### How was this patch tested?
    
    Added new test cases to ParameterSubstitutionSuite
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    Yes, Claude Opus 4.6 high
    
    Closes #54798 from srielau/emoji.
    
    Authored-by: Serge Rielau <[email protected]>
    Signed-off-by: Wenchen Fan <[email protected]>
    (cherry picked from commit 4d7976870e4b53950e96f5cd26938c3093d6e421)
    Signed-off-by: Wenchen Fan <[email protected]>
---
 .../catalyst/parser/SubstituteParamsParser.scala   | 19 ++++++-----
 .../parser/ParameterSubstitutionSuite.scala        | 37 ++++++++++++++++++++++
 2 files changed, 48 insertions(+), 8 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/SubstituteParamsParser.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/SubstituteParamsParser.scala
index 9beead0e6487..11f6180fe35d 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/SubstituteParamsParser.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/SubstituteParamsParser.scala
@@ -188,22 +188,25 @@ class SubstituteParamsParser extends Logging {
    * Apply a list of substitutions to the SQL text.
    * Inserts a space separator when a parameter is immediately preceded by a 
quote
    * to avoid back-to-back quotes after substitution.
+   *
+   * ANTLR's CodePointCharStream reports token positions in Unicode code 
points, but
+   * Java/Scala String indices are in UTF-16 code units. Supplementary 
characters
+   * (e.g. emojis) occupy 1 code point but 2 code units, so we must convert.
    */
   private def applySubstitutions(sqlText: String, substitutions: 
List[Substitution]): String = {
-    // Sort substitutions by start position in reverse order to avoid offset 
issues
     val sortedSubstitutions = substitutions.sortBy(-_.start)
 
     var result = sqlText
     sortedSubstitutions.foreach { substitution =>
-      val prefix = result.substring(0, substitution.start)
+      val startCU = result.offsetByCodePoints(0, substitution.start)
+      val endCU = result.offsetByCodePoints(0, substitution.end)
+      val prefix = result.substring(0, startCU)
       val replacement = substitution.replacement
-      val suffix = result.substring(substitution.end)
+      val suffix = result.substring(endCU)
 
-      // Check if replacement is immediately preceded by a quote and doesn't 
already
-      // start with whitespace
-      val needsSpace = substitution.start > 0 &&
-        (result(substitution.start - 1) == '\'' || result(substitution.start - 
1) == '"') &&
-        replacement.nonEmpty && !replacement(0).isWhitespace
+      val needsSpace = startCU > 0 &&
+        (result.charAt(startCU - 1) == '\'' || result.charAt(startCU - 1) == 
'"') &&
+        replacement.nonEmpty && !replacement.charAt(0).isWhitespace
 
       val space = if (needsSpace) " " else ""
       result = s"$prefix$space$replacement$suffix"
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ParameterSubstitutionSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ParameterSubstitutionSuite.scala
index 83340ed1edc2..8baf3a921eaf 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ParameterSubstitutionSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ParameterSubstitutionSuite.scala
@@ -121,6 +121,43 @@ class ParameterSubstitutionSuite extends SparkFunSuite {
     }
   }
 
+  test("ParameterHandler - named parameter with emoji in SQL") {
+    val emoji = new String(Character.toChars(0x1F4AA)) // supplementary char 
(2 UTF-16 code units)
+    val context = NamedParameterContext(Map("team" -> Literal("abc")))
+    val sql = s"SELECT '${emoji}' AS a FROM T WHERE :team IS NULL"
+    val (result, _) = ParameterHandler.substituteParameters(sql, context)
+    assert(result === s"SELECT '${emoji}' AS a FROM T WHERE 'abc' IS NULL")
+  }
+
+  test("ParameterHandler - positional parameter with emoji in SQL") {
+    val emoji = new String(Character.toChars(0x1F4AA))
+    val context = PositionalParameterContext(Seq(Literal("abc")))
+    val sql = s"SELECT '${emoji}' AS a FROM T WHERE ? IS NULL"
+    val (result, _) = ParameterHandler.substituteParameters(sql, context)
+    assert(result === s"SELECT '${emoji}' AS a FROM T WHERE 'abc' IS NULL")
+  }
+
+  test("ParameterHandler - multiple params with emoji in SQL and replacement 
values") {
+    val flexed = new String(Character.toChars(0x1F4AA))
+    val tada = new String(Character.toChars(0x1F389))
+    val context = NamedParameterContext(Map(
+      "p1" -> Literal(tada),
+      "p2" -> Literal(42)
+    ))
+    val sql = s"SELECT '${flexed}', :p1, '${flexed}', :p2"
+    val (result, _) = ParameterHandler.substituteParameters(sql, context)
+    assert(result === s"SELECT '${flexed}', '${tada}', '${flexed}', 42")
+  }
+
+  test("ParameterHandler - positional params with multiple emojis") {
+    val flexed = new String(Character.toChars(0x1F4AA))
+    val tada = new String(Character.toChars(0x1F389))
+    val context = PositionalParameterContext(Seq(Literal(tada), Literal(99)))
+    val sql = s"SELECT '${flexed}', ?, '${flexed}${flexed}', ?"
+    val (result, _) = ParameterHandler.substituteParameters(sql, context)
+    assert(result === s"SELECT '${flexed}', '${tada}', '${flexed}${flexed}', 
99")
+  }
+
   test("Large parameter set") {
 
     val largeParamMap = (1 to 100).map(i => s"param$i" -> Literal(i)).toMap


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to