This is an automated email from the ASF dual-hosted git repository.
yao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 7c7c1965878b [SPARK-48712][SQL] Perf Improvement for encode with empty
values or UTF-8 charset
7c7c1965878b is described below
commit 7c7c1965878b2f8346696e0b36629f06ac776d11
Author: Kent Yao <[email protected]>
AuthorDate: Thu Jun 27 10:20:18 2024 +0800
[SPARK-48712][SQL] Perf Improvement for encode with empty values or UTF-8
charset
### What changes were proposed in this pull request?
This PR adds a short-circuit that returns the underlying byte array
directly and bypasses the encoding process when 'encoding UTF8String instances
w/ UTF-8 charset' or 'UTF8String.EMPTY_STRING w/ any charset'.
### Why are the changes needed?
Performance improvement, 10x~20x according to benchmark results
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
new unit tests
benchmark tests
### Was this patch authored or co-authored using generative AI tooling?
no
Closes #47096 from yaooqinn/SPARK-48712.
Authored-by: Kent Yao <[email protected]>
Signed-off-by: Kent Yao <[email protected]>
---
.../catalyst/expressions/stringExpressions.scala | 3 +
.../benchmarks/EncodeBenchmark-jdk21-results.txt | 8 +++
sql/core/benchmarks/EncodeBenchmark-results.txt | 8 +++
.../analyzer-results/ansi/string-functions.sql.out | 35 +++++++++++
.../analyzer-results/string-functions.sql.out | 35 +++++++++++
.../sql-tests/inputs/string-functions.sql | 5 ++
.../results/ansi/string-functions.sql.out | 40 ++++++++++++
.../sql-tests/results/string-functions.sql.out | 40 ++++++++++++
.../sql/execution/benchmark/EncodeBenchmark.scala | 73 ++++++++++++++++++++++
9 files changed, 247 insertions(+)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
index 707363e4b682..e951d40d4d46 100755
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
@@ -3010,6 +3010,9 @@ object Encode {
legacyCharsets: Boolean,
legacyErrorAction: Boolean): Array[Byte] = {
val toCharset = charset.toString
+ if (input.numBytes == 0 || "UTF-8".equalsIgnoreCase(toCharset)) {
+ return input.getBytes
+ }
if (legacyCharsets ||
VALID_CHARSETS.contains(toCharset.toUpperCase(Locale.ROOT))) {
val encoder = try {
val codingErrorAction = if (legacyErrorAction) {
diff --git a/sql/core/benchmarks/EncodeBenchmark-jdk21-results.txt
b/sql/core/benchmarks/EncodeBenchmark-jdk21-results.txt
new file mode 100644
index 000000000000..0a6164bc652e
--- /dev/null
+++ b/sql/core/benchmarks/EncodeBenchmark-jdk21-results.txt
@@ -0,0 +1,8 @@
+OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure
+AMD EPYC 7763 64-Core Processor
+encode: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------------------------------
+UTF-32 47469 47482
19 0.2 4746.9 1.0X
+UTF-16 57463 57487
35 0.2 5746.3 0.8X
+UTF-8 2803 2805
3 3.6 280.3 16.9X
+
diff --git a/sql/core/benchmarks/EncodeBenchmark-results.txt
b/sql/core/benchmarks/EncodeBenchmark-results.txt
new file mode 100644
index 000000000000..404138db7d36
--- /dev/null
+++ b/sql/core/benchmarks/EncodeBenchmark-results.txt
@@ -0,0 +1,8 @@
+OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure
+AMD EPYC 7763 64-Core Processor
+encode: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------------------------------
+UTF-32 31107 31205
138 0.3 3110.7 1.0X
+UTF-16 47904 47934
43 0.2 4790.4 0.6X
+UTF-8 2957 2978
30 3.4 295.7 10.5X
+
diff --git
a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out
b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out
index c4f002d84ea6..98664dedf820 100644
---
a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out
+++
b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out
@@ -739,6 +739,41 @@ Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x]
+- LocalRelation [scol#x, ecol#x]
+-- !query
+select encode(decode(encode('白日依山尽,黄河入海流。欲穷千里目,更上一层楼。', 'UTF-16'), 'UTF-16'),
'UTF-8')
+-- !query analysis
+Project [encode(decode(encode(白日依山尽,黄河入海流。欲穷千里目,更上一层楼。, UTF-16), UTF-16),
UTF-8) AS encode(decode(encode(白日依山尽,黄河入海流。欲穷千里目,更上一层楼。, UTF-16), UTF-16),
UTF-8)#x]
++- OneRowRelation
+
+
+-- !query
+select encode(decode(encode('南山經之首曰䧿山。其首曰招搖之山,臨於西海之上。', 'UTF-16'), 'UTF-16'),
'UTF-8')
+-- !query analysis
+Project [encode(decode(encode(南山經之首曰䧿山。其首曰招搖之山,臨於西海之上。, UTF-16), UTF-16),
UTF-8) AS encode(decode(encode(南山經之首曰䧿山。其首曰招搖之山,臨於西海之上。, UTF-16), UTF-16),
UTF-8)#x]
++- OneRowRelation
+
+
+-- !query
+select encode(decode(encode('세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark', 'UTF-16'),
'UTF-16'), 'UTF-8')
+-- !query analysis
+Project [encode(decode(encode(세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark, UTF-16),
UTF-16), UTF-8) AS encode(decode(encode(세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark,
UTF-16), UTF-16), UTF-8)#x]
++- OneRowRelation
+
+
+-- !query
+select encode(decode(encode('το Spark είναι το πιο δημοφιλές πλαίσιο
επεξεργασίας μεγάλων δεδομένων παγκοσμίως', 'UTF-16'), 'UTF-16'), 'UTF-8')
+-- !query analysis
+Project [encode(decode(encode(το Spark είναι το πιο δημοφιλές πλαίσιο
επεξεργασίας μεγάλων δεδομένων παγκοσμίως, UTF-16), UTF-16), UTF-8) AS
encode(decode(encode(το Spark είναι το πιο δημοφιλές πλαίσιο επεξεργασίας
μεγάλων δεδομένων παγκοσμίως, UTF-16), UTF-16), UTF-8)#x]
++- OneRowRelation
+
+
+-- !query
+select encode(decode(encode('Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。', 'UTF-16'),
'UTF-16'), 'UTF-8')
+-- !query analysis
+Project [encode(decode(encode(Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。, UTF-16),
UTF-16), UTF-8) AS encode(decode(encode(Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。,
UTF-16), UTF-16), UTF-8)#x]
++- OneRowRelation
+
+
-- !query
select decode()
-- !query analysis
diff --git
a/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out
b/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out
index c4f002d84ea6..98664dedf820 100644
---
a/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out
+++
b/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out
@@ -739,6 +739,41 @@ Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x]
+- LocalRelation [scol#x, ecol#x]
+-- !query
+select encode(decode(encode('白日依山尽,黄河入海流。欲穷千里目,更上一层楼。', 'UTF-16'), 'UTF-16'),
'UTF-8')
+-- !query analysis
+Project [encode(decode(encode(白日依山尽,黄河入海流。欲穷千里目,更上一层楼。, UTF-16), UTF-16),
UTF-8) AS encode(decode(encode(白日依山尽,黄河入海流。欲穷千里目,更上一层楼。, UTF-16), UTF-16),
UTF-8)#x]
++- OneRowRelation
+
+
+-- !query
+select encode(decode(encode('南山經之首曰䧿山。其首曰招搖之山,臨於西海之上。', 'UTF-16'), 'UTF-16'),
'UTF-8')
+-- !query analysis
+Project [encode(decode(encode(南山經之首曰䧿山。其首曰招搖之山,臨於西海之上。, UTF-16), UTF-16),
UTF-8) AS encode(decode(encode(南山經之首曰䧿山。其首曰招搖之山,臨於西海之上。, UTF-16), UTF-16),
UTF-8)#x]
++- OneRowRelation
+
+
+-- !query
+select encode(decode(encode('세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark', 'UTF-16'),
'UTF-16'), 'UTF-8')
+-- !query analysis
+Project [encode(decode(encode(세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark, UTF-16),
UTF-16), UTF-8) AS encode(decode(encode(세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark,
UTF-16), UTF-16), UTF-8)#x]
++- OneRowRelation
+
+
+-- !query
+select encode(decode(encode('το Spark είναι το πιο δημοφιλές πλαίσιο
επεξεργασίας μεγάλων δεδομένων παγκοσμίως', 'UTF-16'), 'UTF-16'), 'UTF-8')
+-- !query analysis
+Project [encode(decode(encode(το Spark είναι το πιο δημοφιλές πλαίσιο
επεξεργασίας μεγάλων δεδομένων παγκοσμίως, UTF-16), UTF-16), UTF-8) AS
encode(decode(encode(το Spark είναι το πιο δημοφιλές πλαίσιο επεξεργασίας
μεγάλων δεδομένων παγκοσμίως, UTF-16), UTF-16), UTF-8)#x]
++- OneRowRelation
+
+
+-- !query
+select encode(decode(encode('Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。', 'UTF-16'),
'UTF-16'), 'UTF-8')
+-- !query analysis
+Project [encode(decode(encode(Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。, UTF-16),
UTF-16), UTF-8) AS encode(decode(encode(Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。,
UTF-16), UTF-16), UTF-8)#x]
++- OneRowRelation
+
+
-- !query
select decode()
-- !query analysis
diff --git a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql
b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql
index 256b8e0d49fa..c108f7c76f76 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql
@@ -132,6 +132,11 @@ select encode(scol, ecol) from values('渭城朝雨浥轻尘',
'US-ASCII') as t(
set spark.sql.legacy.codingErrorAction=false;
select encode('客舍青青柳色新', 'US-ASCII');
select encode(scol, ecol) from values('客舍青青柳色新', 'US-ASCII') as t(scol, ecol);
+select encode(decode(encode('白日依山尽,黄河入海流。欲穷千里目,更上一层楼。', 'UTF-16'), 'UTF-16'),
'UTF-8');
+select encode(decode(encode('南山經之首曰䧿山。其首曰招搖之山,臨於西海之上。', 'UTF-16'), 'UTF-16'),
'UTF-8');
+select encode(decode(encode('세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark', 'UTF-16'),
'UTF-16'), 'UTF-8');
+select encode(decode(encode('το Spark είναι το πιο δημοφιλές πλαίσιο
επεξεργασίας μεγάλων δεδομένων παγκοσμίως', 'UTF-16'), 'UTF-16'), 'UTF-8');
+select encode(decode(encode('Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。', 'UTF-16'),
'UTF-16'), 'UTF-8');
-- decode
select decode();
diff --git
a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out
b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out
index 24d4cfa74b5a..da2fa9ca0c18 100644
---
a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out
+++
b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out
@@ -967,6 +967,46 @@ org.apache.spark.SparkRuntimeException
}
+-- !query
+select encode(decode(encode('白日依山尽,黄河入海流。欲穷千里目,更上一层楼。', 'UTF-16'), 'UTF-16'),
'UTF-8')
+-- !query schema
+struct<encode(decode(encode(白日依山尽,黄河入海流。欲穷千里目,更上一层楼。, UTF-16), UTF-16),
UTF-8):binary>
+-- !query output
+白日依山尽,黄河入海流。欲穷千里目,更上一层楼。
+
+
+-- !query
+select encode(decode(encode('南山經之首曰䧿山。其首曰招搖之山,臨於西海之上。', 'UTF-16'), 'UTF-16'),
'UTF-8')
+-- !query schema
+struct<encode(decode(encode(南山經之首曰䧿山。其首曰招搖之山,臨於西海之上。, UTF-16), UTF-16),
UTF-8):binary>
+-- !query output
+南山經之首曰䧿山。其首曰招搖之山,臨於西海之上。
+
+
+-- !query
+select encode(decode(encode('세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark', 'UTF-16'),
'UTF-16'), 'UTF-8')
+-- !query schema
+struct<encode(decode(encode(세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark, UTF-16),
UTF-16), UTF-8):binary>
+-- !query output
+세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark
+
+
+-- !query
+select encode(decode(encode('το Spark είναι το πιο δημοφιλές πλαίσιο
επεξεργασίας μεγάλων δεδομένων παγκοσμίως', 'UTF-16'), 'UTF-16'), 'UTF-8')
+-- !query schema
+struct<encode(decode(encode(το Spark είναι το πιο δημοφιλές πλαίσιο
επεξεργασίας μεγάλων δεδομένων παγκοσμίως, UTF-16), UTF-16), UTF-8):binary>
+-- !query output
+το Spark είναι το πιο δημοφιλές πλαίσιο επεξεργασίας μεγάλων δεδομένων
παγκοσμίως
+
+
+-- !query
+select encode(decode(encode('Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。', 'UTF-16'),
'UTF-16'), 'UTF-8')
+-- !query schema
+struct<encode(decode(encode(Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。, UTF-16),
UTF-16), UTF-8):binary>
+-- !query output
+Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。
+
+
-- !query
select decode()
-- !query schema
diff --git
a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out
b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out
index 53f516dac03c..d42c387c8057 100644
--- a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out
@@ -899,6 +899,46 @@ org.apache.spark.SparkRuntimeException
}
+-- !query
+select encode(decode(encode('白日依山尽,黄河入海流。欲穷千里目,更上一层楼。', 'UTF-16'), 'UTF-16'),
'UTF-8')
+-- !query schema
+struct<encode(decode(encode(白日依山尽,黄河入海流。欲穷千里目,更上一层楼。, UTF-16), UTF-16),
UTF-8):binary>
+-- !query output
+白日依山尽,黄河入海流。欲穷千里目,更上一层楼。
+
+
+-- !query
+select encode(decode(encode('南山經之首曰䧿山。其首曰招搖之山,臨於西海之上。', 'UTF-16'), 'UTF-16'),
'UTF-8')
+-- !query schema
+struct<encode(decode(encode(南山經之首曰䧿山。其首曰招搖之山,臨於西海之上。, UTF-16), UTF-16),
UTF-8):binary>
+-- !query output
+南山經之首曰䧿山。其首曰招搖之山,臨於西海之上。
+
+
+-- !query
+select encode(decode(encode('세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark', 'UTF-16'),
'UTF-16'), 'UTF-8')
+-- !query schema
+struct<encode(decode(encode(세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark, UTF-16),
UTF-16), UTF-8):binary>
+-- !query output
+세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark
+
+
+-- !query
+select encode(decode(encode('το Spark είναι το πιο δημοφιλές πλαίσιο
επεξεργασίας μεγάλων δεδομένων παγκοσμίως', 'UTF-16'), 'UTF-16'), 'UTF-8')
+-- !query schema
+struct<encode(decode(encode(το Spark είναι το πιο δημοφιλές πλαίσιο
επεξεργασίας μεγάλων δεδομένων παγκοσμίως, UTF-16), UTF-16), UTF-8):binary>
+-- !query output
+το Spark είναι το πιο δημοφιλές πλαίσιο επεξεργασίας μεγάλων δεδομένων
παγκοσμίως
+
+
+-- !query
+select encode(decode(encode('Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。', 'UTF-16'),
'UTF-16'), 'UTF-8')
+-- !query schema
+struct<encode(decode(encode(Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。, UTF-16),
UTF-16), UTF-8):binary>
+-- !query output
+Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。
+
+
-- !query
select decode()
-- !query schema
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/EncodeBenchmark.scala
b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/EncodeBenchmark.scala
new file mode 100644
index 000000000000..76ebd7f41677
--- /dev/null
+++
b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/EncodeBenchmark.scala
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.benchmark
+
+import org.apache.spark.benchmark.Benchmark
+
+/**
+ * Benchmark for encode
+ * To run this benchmark:
+ * {{{
+ * 1. without sbt:
+ * bin/spark-submit --class <this class> --jars <spark core test jar>
<sql core test jar>
+ * 2. build/sbt "sql/Test/runMain <this class>"
+ * 3. generate result:
+ * SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/Test/runMain <this
class>"
+ * Results will be written to "benchmarks/EncodeBenchmark-results.txt".
+ * }}}
+ */
+object EncodeBenchmark extends SqlBasedBenchmark {
+ import spark.implicits._
+ private val N = 10L * 1000 * 1000 // number of rows written to the temporary Parquet dataset
+
+ override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
+ withTempPath { path =>
+ // scalastyle:off nonascii
+ val exprs = Seq(
+ "",
+ "Spark",
+ "白日依山尽,黄河入海流。欲穷千里目,更上一层楼。",
+ "το Spark είναι το πιο δημοφιλές πλαίσιο επεξεργασίας μεγάλων
δεδομένων παγκοσμίως",
+ "세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark",
+ "Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。")
+ // scalastyle:on nonascii
+
+ spark.range(N).map { i =>
+ val idx = (i % 6).toInt // cycle through the six sample strings in exprs
+ val str = exprs(idx)
+ (str, str * 3, str * 5, str * 9, "") // text repeated 1x/3x/5x/9x plus an always-empty column
+ }.write.parquet(path.getCanonicalPath)
+
+ val benchmark = new Benchmark("encode", N, output = output)
+ def addBenchmarkCase(charset: String): Unit = {
+ benchmark.addCase(charset) { _ =>
+ spark.read.parquet(path.getCanonicalPath).selectExpr(
+ s"encode(_1, '$charset')",
+ s"encode(_2, '$charset')",
+ s"encode(_3, '$charset')",
+ s"encode(_4, '$charset')",
+ s"encode(_5, '$charset')").noop()
+ }
+ }
+ addBenchmarkCase("UTF-32")
+ addBenchmarkCase("UTF-16")
+ addBenchmarkCase("UTF-8")
+ benchmark.run()
+ }
+ }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]