This is an automated email from the ASF dual-hosted git repository. yumwang pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 72ce64ef9daf [SPARK-52817][SQL] Fix `Like` Expression performance 72ce64ef9daf is described below commit 72ce64ef9dafec1ce6ac1b09c8129d1e0c4f4f1d Author: zhixingheyi-tian <1564481...@qq.com> AuthorDate: Sat Jul 19 08:07:13 2025 +0800 [SPARK-52817][SQL] Fix `Like` Expression performance ### What changes were proposed in this pull request? Make the contains function usable for `Like` expressions with multiple consecutive '%'. ### Why are the changes needed? In some customers' cases, users sometimes use multiple consecutive '%' in `Like` expressions. For example: ``` SELECT * FROM testData where value not like '%%HotFocus%%' SELECT * FROM testData where value not like '%%%HotFocus%%%' ``` In these SQL queries, the `Like` expressions cannot be converted to the contains function during logical planning, so the performance is very poor. ### How was this patch tested? Added new UTs and ran the existing UTs. Closes #51510 from zhixingheyi-tian/fix-like. Authored-by: zhixingheyi-tian <1564481...@qq.com> Signed-off-by: Yuming Wang <yumw...@ebay.com> --- .../spark/sql/catalyst/optimizer/expressions.scala | 9 ++-- .../optimizer/LikeSimplificationSuite.scala | 48 ++++++++++++++++++++++ 2 files changed, 53 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala index ad90b51d7fc3..f4944cede19a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala @@ -743,10 +743,11 @@ object SupportedBinaryExpr { object LikeSimplification extends Rule[LogicalPlan] with PredicateHelper { // if guards below protect from escapes on trailing %. // Cases like "something\%" are not optimized, but this does not affect correctness. 
- private val startsWith = "([^_%]+)%".r - private val endsWith = "%([^_%]+)".r - private val startsAndEndsWith = "([^_%]+)%([^_%]+)".r - private val contains = "%([^_%]+)%".r + // Consecutive wildcard characters are equivalent to a single wildcard character. + private val startsWith = "([^_%]+)%+".r + private val endsWith = "%+([^_%]+)".r + private val startsAndEndsWith = "([^_%]+)%+([^_%]+)".r + private val contains = "%+([^_%]+)%+".r private val equalTo = "([^_%]*)".r private def simplifyLike( diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LikeSimplificationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LikeSimplificationSuite.scala index 992170dbc0d2..e8cb2b2dd8b0 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LikeSimplificationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LikeSimplificationSuite.scala @@ -165,6 +165,54 @@ class LikeSimplificationSuite extends PlanTest { comparePlans(optimized5, correctAnswer5) } + test("SPARK-52817: Spark SQL LIKE expressions show poor performance when using multiple '%'") { + val originalQuery1 = + testRelation + .where($"a" like "abc%%") + val optimized1 = Optimize.execute(originalQuery1.analyze) + val correctAnswer1 = testRelation + .where(StartsWith($"a", "abc")) + .analyze + comparePlans(optimized1, correctAnswer1) + + val originalQuery2 = + testRelation + .where($"a" like "%%xyz") + val optimized2 = Optimize.execute(originalQuery2.analyze) + val correctAnswer2 = testRelation + .where(EndsWith($"a", "xyz")) + .analyze + comparePlans(optimized2, correctAnswer2) + + val originalQuery3 = + testRelation + .where($"a" like "abc%%def") + val optimized3 = Optimize.execute(originalQuery3.analyze) + val correctAnswer3 = testRelation + .where( + (Length($"a") >= 6 && (StartsWith($"a", "abc") && EndsWith($"a", "def")))) + .analyze + comparePlans(optimized3, correctAnswer3) + + val 
originalQuery4 = + testRelation + .where(($"a" like "%%mn%%")) + val optimized4 = Optimize.execute(originalQuery4.analyze) + val correctAnswer4 = testRelation + .where(Contains($"a", "mn")) + .analyze + comparePlans(optimized4, correctAnswer4) + + val originalQuery5 = + testRelation + .where(($"a" like "%%%mn%%%")) + val optimized5 = Optimize.execute(originalQuery5.analyze) + val correctAnswer5 = testRelation + .where(Contains($"a", "mn")) + .analyze + comparePlans(optimized5, correctAnswer5) + } + test("simplify LikeAll") { val originalQuery = testRelation --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org