This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 77630e9f058a [SPARK-50169][SQL] Improve performance of `RegExpReplace`
77630e9f058a is described below
commit 77630e9f058a096362c5afdd7d1e5b4bb652a3f9
Author: panbingkun <[email protected]>
AuthorDate: Thu Oct 31 15:47:55 2024 +0900
[SPARK-50169][SQL] Improve performance of `RegExpReplace`
### What changes were proposed in this pull request?
This PR replaces `StringBuffer` with `StringBuilder` in
`RegExpReplace` (`regexp_replace`) to improve performance. (Note: performance
data can be found in the PR comments below.)
### Why are the changes needed?
In our scenario, the thread-safe `StringBuffer` is unnecessary; the unsynchronized `StringBuilder` avoids its locking overhead.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Passes GA (GitHub Actions).
Manually checked.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #48702 from panbingkun/SPARK-50169.
Authored-by: panbingkun <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
.../catalyst/expressions/regexpExpressions.scala | 7 +--
.../StringFunctionsBenchmark-jdk21-results.txt | 11 ++++
.../StringFunctionsBenchmark-results.txt | 11 ++++
.../benchmark/StringFunctionsBenchmark.scala | 58 ++++++++++++++++++++++
4 files changed, 84 insertions(+), 3 deletions(-)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
index 5d4994ff007d..4ace3dc95a43 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
@@ -17,6 +17,7 @@
package org.apache.spark.sql.catalyst.expressions
+import java.lang.{StringBuilder => JStringBuilder}
import java.util.Locale
import java.util.regex.{Matcher, MatchResult, Pattern, PatternSyntaxException}
@@ -681,7 +682,7 @@ case class RegExpReplace(subject: Expression, regexp:
Expression, rep: Expressio
@transient private var lastReplacement: String = _
@transient private var lastReplacementInUTF8: UTF8String = _
// result buffer write by Matcher
- @transient private lazy val result: StringBuffer = new StringBuffer
+ @transient private lazy val result: JStringBuilder = new JStringBuilder
final override val nodePatterns: Seq[TreePattern] = Seq(REGEXP_REPLACE)
override def nullSafeEval(s: Any, p: Any, r: Any, i: Any): Any = {
@@ -727,7 +728,7 @@ case class RegExpReplace(subject: Expression, regexp:
Expression, rep: Expressio
override protected def doGenCode(ctx: CodegenContext, ev: ExprCode):
ExprCode = {
val termResult = ctx.freshName("termResult")
- val classNameStringBuffer =
classOf[java.lang.StringBuffer].getCanonicalName
+ val classNameStringBuilder = classOf[JStringBuilder].getCanonicalName
val matcher = ctx.freshName("matcher")
val source = ctx.freshName("source")
@@ -753,7 +754,7 @@ case class RegExpReplace(subject: Expression, regexp:
Expression, rep: Expressio
String $source = $subject.toString();
int $position = $pos - 1;
if ($position == 0 || $position < $source.length()) {
- $classNameStringBuffer $termResult = new $classNameStringBuffer();
+ $classNameStringBuilder $termResult = new $classNameStringBuilder();
$matcher.region($position, $source.length());
while ($matcher.find()) {
diff --git a/sql/core/benchmarks/StringFunctionsBenchmark-jdk21-results.txt
b/sql/core/benchmarks/StringFunctionsBenchmark-jdk21-results.txt
new file mode 100644
index 000000000000..04720fb50b41
--- /dev/null
+++ b/sql/core/benchmarks/StringFunctionsBenchmark-jdk21-results.txt
@@ -0,0 +1,11 @@
+================================================================================================
+SQL string functions
+================================================================================================
+
+OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure
+AMD EPYC 7763 64-Core Processor
+regexp_replace: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------------------------------
+regexp_replace('*-*', '(\\d+)', 'num') 503 527
32 2.0 503.0 1.0X
+
+
diff --git a/sql/core/benchmarks/StringFunctionsBenchmark-results.txt
b/sql/core/benchmarks/StringFunctionsBenchmark-results.txt
new file mode 100644
index 000000000000..c1b9bdb4ea3d
--- /dev/null
+++ b/sql/core/benchmarks/StringFunctionsBenchmark-results.txt
@@ -0,0 +1,11 @@
+================================================================================================
+SQL string functions
+================================================================================================
+
+OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure
+AMD EPYC 7763 64-Core Processor
+regexp_replace: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------------------------------
+regexp_replace('*-*', '(\\d+)', 'num') 512 529
24 2.0 512.0 1.0X
+
+
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/StringFunctionsBenchmark.scala
b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/StringFunctionsBenchmark.scala
new file mode 100644
index 000000000000..64cb434f0942
--- /dev/null
+++
b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/StringFunctionsBenchmark.scala
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.sql.execution.benchmark
+
+import org.apache.spark.benchmark.Benchmark
+import org.apache.spark.sql.functions.{col, concat, lit}
+import org.apache.spark.sql.types.StructType
+
+/**
+ * Synthetic benchmark for the string functions.
+ * To run this benchmark:
+ * {{{
+ * 1. without sbt:
+ * bin/spark-submit --class <this class>
+ * --jars <spark core test jar>,<spark catalyst test jar> <sql core
test jar>
+ * 2. build/sbt "sql/Test/runMain <this class>"
+ * 3. generate result:
+ * SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/Test/runMain <this
class>"
+ * Results will be written to
"benchmarks/StringFunctionsBenchmark-results.txt".
+ * }}}
+ */
+object StringFunctionsBenchmark extends SqlBasedBenchmark {
+ private val N = 10_000_00
+ private val M = 100
+
+ private val df = spark.range(N).to(new StructType().add("id", "int")).
+ withColumn("subject", concat(col("id"), lit("-"), col("id") * 2)).
+ withColumn("regexp", lit("(\\d+)")).
+ withColumn("rep", lit("num"))
+
+ private def doRegexpReplaceBenchmark(): Unit = {
+ df.selectExpr("regexp_replace(subject, regexp, rep)").noop()
+ }
+
+ override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
+ runBenchmark("SQL string functions") {
+ val benchmark = new Benchmark("regexp_replace", N, output = output)
+ benchmark.addCase("""regexp_replace('*-*', '(\\d+)', 'num')""", M) { _ =>
+ doRegexpReplaceBenchmark()
+ }
+ benchmark.run()
+ }
+ }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]