This is an automated email from the ASF dual-hosted git repository.

yao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 8ee8abaa599f [SPARK-48621][SQL] Fix Like simplification in Optimizer for collated strings
8ee8abaa599f is described below

commit 8ee8abaa599fd6efea85018549f1ec135af319e0
Author: Uros Bojanic <[email protected]>
AuthorDate: Fri Jun 14 17:32:16 2024 +0800

    [SPARK-48621][SQL] Fix Like simplification in Optimizer for collated strings
    
    ### What changes were proposed in this pull request?
    Enable `LikeSimplification` optimizer rule for collated strings.
    
    ### Why are the changes needed?
    Optimize how the `Like` expression works with collated strings, and ensure collation awareness when replacing `Like` expressions with `StartsWith` / `EndsWith` / `Contains` / `EqualTo` under special conditions.
    
    ### Does this PR introduce _any_ user-facing change?
    No.
    
    ### How was this patch tested?
    New e2e sql tests in `CollationSQLRegexpSuite`.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    No.
    
    Closes #46976 from uros-db/like-simplification.
    
    Authored-by: Uros Bojanic <[email protected]>
    Signed-off-by: Kent Yao <[email protected]>
---
 .../spark/sql/catalyst/optimizer/expressions.scala | 17 +++----
 .../apache/spark/sql/CollationSQLRegexpSuite.scala | 56 ++++++++++++++++++++++
 2 files changed, 65 insertions(+), 8 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala
index 2c55e4c8fd37..2606dd2d7737 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala
@@ -738,18 +738,19 @@ object LikeSimplification extends Rule[LogicalPlan] with 
PredicateHelper {
     } else {
       pattern match {
         case startsWith(prefix) =>
-          Some(StartsWith(input, Literal(prefix)))
+          Some(StartsWith(input, Literal.create(prefix, input.dataType)))
         case endsWith(postfix) =>
-          Some(EndsWith(input, Literal(postfix)))
+          Some(EndsWith(input, Literal.create(postfix, input.dataType)))
         // 'a%a' pattern is basically same with 'a%' && '%a'.
         // However, the additional `Length` condition is required to prevent 
'a' match 'a%a'.
-        case startsAndEndsWith(prefix, postfix) =>
-          Some(And(GreaterThanOrEqual(Length(input), Literal(prefix.length + 
postfix.length)),
-            And(StartsWith(input, Literal(prefix)), EndsWith(input, 
Literal(postfix)))))
+        case startsAndEndsWith(prefix, postfix) => Some(
+          And(GreaterThanOrEqual(Length(input), Literal.create(prefix.length + 
postfix.length)),
+          And(StartsWith(input, Literal.create(prefix, input.dataType)),
+            EndsWith(input, Literal.create(postfix, input.dataType)))))
         case contains(infix) =>
-          Some(Contains(input, Literal(infix)))
+          Some(Contains(input, Literal.create(infix, input.dataType)))
         case equalTo(str) =>
-          Some(EqualTo(input, Literal(str)))
+          Some(EqualTo(input, Literal.create(str, input.dataType)))
         case _ => None
       }
     }
@@ -785,7 +786,7 @@ object LikeSimplification extends Rule[LogicalPlan] with 
PredicateHelper {
 
   def apply(plan: LogicalPlan): LogicalPlan = 
plan.transformAllExpressionsWithPruning(
     _.containsPattern(LIKE_FAMLIY), ruleId) {
-    case l @ Like(input, Literal(pattern, StringType), escapeChar) =>
+    case l @ Like(input, Literal(pattern, _: StringType), escapeChar) =>
       if (pattern == null) {
         // If pattern is null, return null value directly, since "col like 
null" == null.
         Literal(null, BooleanType)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala
index 740583064279..885ed3709868 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala
@@ -18,6 +18,8 @@
 package org.apache.spark.sql
 
 import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.plans.logical.Project
+import org.apache.spark.sql.internal.SqlApiConf
 import org.apache.spark.sql.test.SharedSparkSession
 import org.apache.spark.sql.types.{ArrayType, BooleanType, IntegerType, 
StringType}
 
@@ -55,6 +57,60 @@ class CollationSQLRegexpSuite
     })
   }
 
+  test("Like simplification should work with collated strings") {
+    case class SimplifyLikeTestCase[R](collation: String, str: String, cls: 
Class[_], result: R)
+    val testCases = Seq(
+      SimplifyLikeTestCase("UTF8_BINARY", "ab%", classOf[StartsWith], false),
+      SimplifyLikeTestCase("UTF8_BINARY", "%bc", classOf[EndsWith], false),
+      SimplifyLikeTestCase("UTF8_BINARY", "a%c", classOf[And], false),
+      SimplifyLikeTestCase("UTF8_BINARY", "%b%", classOf[Contains], false),
+      SimplifyLikeTestCase("UTF8_BINARY", "abc", classOf[EqualTo], false),
+      SimplifyLikeTestCase("UTF8_LCASE", "ab%", classOf[StartsWith], true),
+      SimplifyLikeTestCase("UTF8_LCASE", "%bc", classOf[EndsWith], true),
+      SimplifyLikeTestCase("UTF8_LCASE", "a%c", classOf[And], true),
+      SimplifyLikeTestCase("UTF8_LCASE", "%b%", classOf[Contains], true),
+      SimplifyLikeTestCase("UTF8_LCASE", "abc", classOf[EqualTo], true)
+    )
+    val tableName = "T"
+    withTable(tableName) {
+      sql(s"CREATE TABLE IF NOT EXISTS $tableName(c STRING) using PARQUET")
+      sql(s"INSERT INTO $tableName(c) VALUES('ABC')")
+      testCases.foreach { t =>
+        val query = sql(s"select c collate ${t.collation} like '${t.str}' FROM 
t")
+        checkAnswer(query, Row(t.result))
+        val optimizedPlan = 
query.queryExecution.optimizedPlan.asInstanceOf[Project]
+        
assert(optimizedPlan.projectList.head.asInstanceOf[Alias].child.getClass == 
t.cls)
+      }
+    }
+  }
+
+  test("Like simplification should work with collated strings (for default 
collation)") {
+    val tableNameBinary = "T_BINARY"
+    withTable(tableNameBinary) {
+      withSQLConf(SqlApiConf.DEFAULT_COLLATION -> "UTF8_BINARY") {
+        sql(s"CREATE TABLE IF NOT EXISTS $tableNameBinary(c STRING) using 
PARQUET")
+        sql(s"INSERT INTO $tableNameBinary(c) VALUES('ABC')")
+        checkAnswer(sql(s"select c like 'ab%' FROM $tableNameBinary"), 
Row(false))
+        checkAnswer(sql(s"select c like '%bc' FROM $tableNameBinary"), 
Row(false))
+        checkAnswer(sql(s"select c like 'a%c' FROM $tableNameBinary"), 
Row(false))
+        checkAnswer(sql(s"select c like '%b%' FROM $tableNameBinary"), 
Row(false))
+        checkAnswer(sql(s"select c like 'abc' FROM $tableNameBinary"), 
Row(false))
+      }
+    }
+    val tableNameLcase = "T_LCASE"
+    withTable(tableNameLcase) {
+      withSQLConf(SqlApiConf.DEFAULT_COLLATION -> "UTF8_LCASE") {
+        sql(s"CREATE TABLE IF NOT EXISTS $tableNameLcase(c STRING) using 
PARQUET")
+        sql(s"INSERT INTO $tableNameLcase(c) VALUES('ABC')")
+        checkAnswer(sql(s"select c like 'ab%' FROM $tableNameLcase"), 
Row(true))
+        checkAnswer(sql(s"select c like '%bc' FROM $tableNameLcase"), 
Row(true))
+        checkAnswer(sql(s"select c like 'a%c' FROM $tableNameLcase"), 
Row(true))
+        checkAnswer(sql(s"select c like '%b%' FROM $tableNameLcase"), 
Row(true))
+        checkAnswer(sql(s"select c like 'abc' FROM $tableNameLcase"), 
Row(true))
+      }
+    }
+  }
+
   test("Support ILike string expression with collation") {
     // Supported collations
     case class ILikeTestCase[R](l: String, r: String, c: String, result: R)


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to