This is an automated email from the ASF dual-hosted git repository.

yumwang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 72ce64ef9daf [SPARK-52817][SQL] Fix `Like` Expression performance
72ce64ef9daf is described below

commit 72ce64ef9dafec1ce6ac1b09c8129d1e0c4f4f1d
Author: zhixingheyi-tian <1564481...@qq.com>
AuthorDate: Sat Jul 19 08:07:13 2025 +0800

    [SPARK-52817][SQL] Fix `Like` Expression performance
    
    ### What changes were proposed in this pull request?
    
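    Allow `LIKE` patterns that contain multiple consecutive '%' wildcards to be simplified to `Contains` (and likewise to `StartsWith` / `EndsWith`), by treating a run of '%' the same as a single '%'.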
    
    ### Why are the changes needed?
    
    In some customer workloads, users write `LIKE` patterns that contain multiple consecutive '%' characters.
    
    For Example:
    ```
    SELECT * FROM testData where value not like '%%HotFocus%%'
    SELECT * FROM testData where value not like '%%%HotFocus%%%'
    ```
    
    For these queries, the optimizer could not convert the `Like` expression into a `Contains` expression during logical planning, so performance was very poor.
    
    ### How was this patch tested?
    
    Covered by newly added unit tests and the existing unit tests.
    
    Closes #51510 from zhixingheyi-tian/fix-like.
    
    Authored-by: zhixingheyi-tian <1564481...@qq.com>
    Signed-off-by: Yuming Wang <yumw...@ebay.com>
---
 .../spark/sql/catalyst/optimizer/expressions.scala |  9 ++--
 .../optimizer/LikeSimplificationSuite.scala        | 48 ++++++++++++++++++++++
 2 files changed, 53 insertions(+), 4 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala
index ad90b51d7fc3..f4944cede19a 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala
@@ -743,10 +743,11 @@ object SupportedBinaryExpr {
 object LikeSimplification extends Rule[LogicalPlan] with PredicateHelper {
   // if guards below protect from escapes on trailing %.
   // Cases like "something\%" are not optimized, but this does not affect 
correctness.
-  private val startsWith = "([^_%]+)%".r
-  private val endsWith = "%([^_%]+)".r
-  private val startsAndEndsWith = "([^_%]+)%([^_%]+)".r
-  private val contains = "%([^_%]+)%".r
+  // Consecutive wildcard characters are equivalent to a single wildcard 
character.
+  private val startsWith = "([^_%]+)%+".r
+  private val endsWith = "%+([^_%]+)".r
+  private val startsAndEndsWith = "([^_%]+)%+([^_%]+)".r
+  private val contains = "%+([^_%]+)%+".r
   private val equalTo = "([^_%]*)".r
 
   private def simplifyLike(
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LikeSimplificationSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LikeSimplificationSuite.scala
index 992170dbc0d2..e8cb2b2dd8b0 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LikeSimplificationSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LikeSimplificationSuite.scala
@@ -165,6 +165,54 @@ class LikeSimplificationSuite extends PlanTest {
     comparePlans(optimized5, correctAnswer5)
   }
 
+  test("SPARK-52817: Spark SQL LIKE expressions show poor performance when 
using multiple '%'") {
+    val originalQuery1 =
+      testRelation
+        .where($"a" like "abc%%")
+    val optimized1 = Optimize.execute(originalQuery1.analyze)
+    val correctAnswer1 = testRelation
+      .where(StartsWith($"a", "abc"))
+      .analyze
+    comparePlans(optimized1, correctAnswer1)
+
+    val originalQuery2 =
+      testRelation
+        .where($"a" like "%%xyz")
+    val optimized2 = Optimize.execute(originalQuery2.analyze)
+    val correctAnswer2 = testRelation
+      .where(EndsWith($"a", "xyz"))
+      .analyze
+    comparePlans(optimized2, correctAnswer2)
+
+    val originalQuery3 =
+      testRelation
+        .where($"a" like "abc%%def")
+    val optimized3 = Optimize.execute(originalQuery3.analyze)
+    val correctAnswer3 = testRelation
+      .where(
+        (Length($"a") >= 6 && (StartsWith($"a", "abc") && EndsWith($"a", 
"def"))))
+      .analyze
+    comparePlans(optimized3, correctAnswer3)
+
+    val originalQuery4 =
+      testRelation
+        .where(($"a" like "%%mn%%"))
+    val optimized4 = Optimize.execute(originalQuery4.analyze)
+    val correctAnswer4 = testRelation
+      .where(Contains($"a", "mn"))
+      .analyze
+    comparePlans(optimized4, correctAnswer4)
+
+    val originalQuery5 =
+      testRelation
+        .where(($"a" like "%%%mn%%%"))
+    val optimized5 = Optimize.execute(originalQuery5.analyze)
+    val correctAnswer5 = testRelation
+      .where(Contains($"a", "mn"))
+      .analyze
+    comparePlans(optimized5, correctAnswer5)
+  }
+
   test("simplify LikeAll") {
     val originalQuery =
       testRelation


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to