(spark) branch master updated: [SPARK-50169][SQL] Improve performance of `RegExpReplace`

gurwls223 Wed, 30 Oct 2024 23:56:22 -0700

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new 77630e9f058a [SPARK-50169][SQL] Improve performance of `RegExpReplace`
77630e9f058a is described below

commit 77630e9f058a096362c5afdd7d1e5b4bb652a3f9
Author: panbingkun <[email protected]>
AuthorDate: Thu Oct 31 15:47:55 2024 +0900

    [SPARK-50169][SQL] Improve performance of `RegExpReplace`
    
    ### What changes were proposed in this pull request?
    This PR replaces `StringBuffer` with `StringBuilder` in `RegExpReplace`
    (`regexp_replace`) to improve performance. (Performance data can be found
    in the comments on the pull request.)
    
    ### Why are the changes needed?
    In this scenario there is no need for the thread-safe (synchronized) `StringBuffer`; the unsynchronized `StringBuilder` is sufficient.
    
    ### Does this PR introduce _any_ user-facing change?
    No.
    
    ### How was this patch tested?
    Passed GA (GitHub Actions CI).
    Manually checked.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    No.
    
    Closes #48702 from panbingkun/SPARK-50169.
    
    Authored-by: panbingkun <[email protected]>
    Signed-off-by: Hyukjin Kwon <[email protected]>
---
 .../catalyst/expressions/regexpExpressions.scala   |  7 +--
 .../StringFunctionsBenchmark-jdk21-results.txt     | 11 ++++
 .../StringFunctionsBenchmark-results.txt           | 11 ++++
 .../benchmark/StringFunctionsBenchmark.scala       | 58 ++++++++++++++++++++++
 4 files changed, 84 insertions(+), 3 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
index 5d4994ff007d..4ace3dc95a43 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
+import java.lang.{StringBuilder => JStringBuilder}
 import java.util.Locale
 import java.util.regex.{Matcher, MatchResult, Pattern, PatternSyntaxException}
 
@@ -681,7 +682,7 @@ case class RegExpReplace(subject: Expression, regexp: 
Expression, rep: Expressio
   @transient private var lastReplacement: String = _
   @transient private var lastReplacementInUTF8: UTF8String = _
   // result buffer write by Matcher
-  @transient private lazy val result: StringBuffer = new StringBuffer
+  @transient private lazy val result: JStringBuilder = new JStringBuilder
   final override val nodePatterns: Seq[TreePattern] = Seq(REGEXP_REPLACE)
 
   override def nullSafeEval(s: Any, p: Any, r: Any, i: Any): Any = {
@@ -727,7 +728,7 @@ case class RegExpReplace(subject: Expression, regexp: 
Expression, rep: Expressio
   override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
     val termResult = ctx.freshName("termResult")
 
-    val classNameStringBuffer = classOf[java.lang.StringBuffer].getCanonicalName
+    val classNameStringBuilder = classOf[JStringBuilder].getCanonicalName
 
     val matcher = ctx.freshName("matcher")
     val source = ctx.freshName("source")
@@ -753,7 +754,7 @@ case class RegExpReplace(subject: Expression, regexp: 
Expression, rep: Expressio
       String $source = $subject.toString();
       int $position = $pos - 1;
       if ($position == 0 || $position < $source.length()) {
-        $classNameStringBuffer $termResult = new $classNameStringBuffer();
+        $classNameStringBuilder $termResult = new $classNameStringBuilder();
         $matcher.region($position, $source.length());
 
         while ($matcher.find()) {
diff --git a/sql/core/benchmarks/StringFunctionsBenchmark-jdk21-results.txt 
b/sql/core/benchmarks/StringFunctionsBenchmark-jdk21-results.txt
new file mode 100644
index 000000000000..04720fb50b41
--- /dev/null
+++ b/sql/core/benchmarks/StringFunctionsBenchmark-jdk21-results.txt
@@ -0,0 +1,11 @@
+================================================================================================
+SQL string functions
+================================================================================================
+
+OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure
+AMD EPYC 7763 64-Core Processor
+regexp_replace:                           Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+regexp_replace('*-*', '(\\d+)', 'num')              503            527         
 32          2.0         503.0       1.0X
+
+
diff --git a/sql/core/benchmarks/StringFunctionsBenchmark-results.txt 
b/sql/core/benchmarks/StringFunctionsBenchmark-results.txt
new file mode 100644
index 000000000000..c1b9bdb4ea3d
--- /dev/null
+++ b/sql/core/benchmarks/StringFunctionsBenchmark-results.txt
@@ -0,0 +1,11 @@
+================================================================================================
+SQL string functions
+================================================================================================
+
+OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure
+AMD EPYC 7763 64-Core Processor
+regexp_replace:                           Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+regexp_replace('*-*', '(\\d+)', 'num')              512            529         
 24          2.0         512.0       1.0X
+
+
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/StringFunctionsBenchmark.scala
 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/StringFunctionsBenchmark.scala
new file mode 100644
index 000000000000..64cb434f0942
--- /dev/null
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/StringFunctionsBenchmark.scala
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.sql.execution.benchmark
+
+import org.apache.spark.benchmark.Benchmark
+import org.apache.spark.sql.functions.{col, concat, lit}
+import org.apache.spark.sql.types.StructType
+
+/**
+ * Synthetic benchmark for the string functions.
+ * To run this benchmark:
+ * {{{
+ *   1. without sbt:
+ *      bin/spark-submit --class <this class>
+ *        --jars <spark core test jar>,<spark catalyst test jar> <sql core test jar>
+ *   2. build/sbt "sql/Test/runMain <this class>"
+ *   3. generate result:
+ *      SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/Test/runMain <this class>"
+ *      Results will be written to "benchmarks/StringFunctionsBenchmark-results.txt".
+ * }}}
+ */
+object StringFunctionsBenchmark extends SqlBasedBenchmark {
+  private val N = 10_000_00
+  private val M = 100
+
+  private val df = spark.range(N).to(new StructType().add("id", "int")).
+    withColumn("subject", concat(col("id"), lit("-"), col("id") * 2)).
+    withColumn("regexp", lit("(\\d+)")).
+    withColumn("rep", lit("num"))
+
+  private def doRegexpReplaceBenchmark(): Unit = {
+    df.selectExpr("regexp_replace(subject, regexp, rep)").noop()
+  }
+
+  override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
+    runBenchmark("SQL string functions") {
+      val benchmark = new Benchmark("regexp_replace", N, output = output)
+      benchmark.addCase("""regexp_replace('*-*', '(\\d+)', 'num')""", M) { _ =>
+        doRegexpReplaceBenchmark()
+      }
+      benchmark.run()
+    }
+  }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(spark) branch master updated: [SPARK-50169][SQL] Improve performance of `RegExpReplace`

Reply via email to