(spark) branch master updated: [SPARK-48712][SQL] Perf Improvement for encode with empty values or UTF-8 charset

yao Wed, 26 Jun 2024 19:20:37 -0700

This is an automated email from the ASF dual-hosted git repository.

yao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new 7c7c1965878b [SPARK-48712][SQL] Perf Improvement for encode with empty 
values or UTF-8 charset
7c7c1965878b is described below

commit 7c7c1965878b2f8346696e0b36629f06ac776d11
Author: Kent Yao <[email protected]>
AuthorDate: Thu Jun 27 10:20:18 2024 +0800

    [SPARK-48712][SQL] Perf Improvement for encode with empty values or UTF-8 
charset
    
    ### What changes were proposed in this pull request?
    
    This PR adds a short-circuit that returns the underlying byte array 
directly and bypasses the encoding process when encoding UTF8String instances 
with the UTF-8 charset, or UTF8String.EMPTY_STRING (zero-byte input) with any charset.
    
    ### Why are the changes needed?
    
    Performance improvement, 10x~20x according to benchmark results
    
    ### Does this PR introduce _any_ user-facing change?
    
    no
    
    ### How was this patch tested?
    
    new unit tests
    benchmark tests
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    no
    
    Closes #47096 from yaooqinn/SPARK-48712.
    
    Authored-by: Kent Yao <[email protected]>
    Signed-off-by: Kent Yao <[email protected]>
---
 .../catalyst/expressions/stringExpressions.scala   |  3 +
 .../benchmarks/EncodeBenchmark-jdk21-results.txt   |  8 +++
 sql/core/benchmarks/EncodeBenchmark-results.txt    |  8 +++
 .../analyzer-results/ansi/string-functions.sql.out | 35 +++++++++++
 .../analyzer-results/string-functions.sql.out      | 35 +++++++++++
 .../sql-tests/inputs/string-functions.sql          |  5 ++
 .../results/ansi/string-functions.sql.out          | 40 ++++++++++++
 .../sql-tests/results/string-functions.sql.out     | 40 ++++++++++++
 .../sql/execution/benchmark/EncodeBenchmark.scala  | 73 ++++++++++++++++++++++
 9 files changed, 247 insertions(+)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
index 707363e4b682..e951d40d4d46 100755
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
@@ -3010,6 +3010,9 @@ object Encode {
       legacyCharsets: Boolean,
       legacyErrorAction: Boolean): Array[Byte] = {
     val toCharset = charset.toString
+    if (input.numBytes == 0 || "UTF-8".equalsIgnoreCase(toCharset)) {
+      return input.getBytes
+    }
     if (legacyCharsets || 
VALID_CHARSETS.contains(toCharset.toUpperCase(Locale.ROOT))) {
       val encoder = try {
         val codingErrorAction = if (legacyErrorAction) {
diff --git a/sql/core/benchmarks/EncodeBenchmark-jdk21-results.txt 
b/sql/core/benchmarks/EncodeBenchmark-jdk21-results.txt
new file mode 100644
index 000000000000..0a6164bc652e
--- /dev/null
+++ b/sql/core/benchmarks/EncodeBenchmark-jdk21-results.txt
@@ -0,0 +1,8 @@
+OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure
+AMD EPYC 7763 64-Core Processor
+encode:                                   Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+UTF-32                                            47469          47482         
 19          0.2        4746.9       1.0X
+UTF-16                                            57463          57487         
 35          0.2        5746.3       0.8X
+UTF-8                                              2803           2805         
  3          3.6         280.3      16.9X
+
diff --git a/sql/core/benchmarks/EncodeBenchmark-results.txt 
b/sql/core/benchmarks/EncodeBenchmark-results.txt
new file mode 100644
index 000000000000..404138db7d36
--- /dev/null
+++ b/sql/core/benchmarks/EncodeBenchmark-results.txt
@@ -0,0 +1,8 @@
+OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure
+AMD EPYC 7763 64-Core Processor
+encode:                                   Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+UTF-32                                            31107          31205         
138          0.3        3110.7       1.0X
+UTF-16                                            47904          47934         
 43          0.2        4790.4       0.6X
+UTF-8                                              2957           2978         
 30          3.4         295.7      10.5X
+
diff --git 
a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out
 
b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out
index c4f002d84ea6..98664dedf820 100644
--- 
a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out
+++ 
b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out
@@ -739,6 +739,41 @@ Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x]
    +- LocalRelation [scol#x, ecol#x]
 
 
+-- !query
+select encode(decode(encode('白日依山尽，黄河入海流。欲穷千里目，更上一层楼。', 'UTF-16'), 'UTF-16'), 
'UTF-8')
+-- !query analysis
+Project [encode(decode(encode(白日依山尽，黄河入海流。欲穷千里目，更上一层楼。, UTF-16), UTF-16), 
UTF-8) AS encode(decode(encode(白日依山尽，黄河入海流。欲穷千里目，更上一层楼。, UTF-16), UTF-16), 
UTF-8)#x]
++- OneRowRelation
+
+
+-- !query
+select encode(decode(encode('南山經之首曰䧿山。其首曰招搖之山，臨於西海之上。', 'UTF-16'), 'UTF-16'), 
'UTF-8')
+-- !query analysis
+Project [encode(decode(encode(南山經之首曰䧿山。其首曰招搖之山，臨於西海之上。, UTF-16), UTF-16), 
UTF-8) AS encode(decode(encode(南山經之首曰䧿山。其首曰招搖之山，臨於西海之上。, UTF-16), UTF-16), 
UTF-8)#x]
++- OneRowRelation
+
+
+-- !query
+select encode(decode(encode('세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark', 'UTF-16'), 
'UTF-16'), 'UTF-8')
+-- !query analysis
+Project [encode(decode(encode(세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark, UTF-16), 
UTF-16), UTF-8) AS encode(decode(encode(세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark, 
UTF-16), UTF-16), UTF-8)#x]
++- OneRowRelation
+
+
+-- !query
+select encode(decode(encode('το Spark είναι το πιο δημοφιλές πλαίσιο 
επεξεργασίας μεγάλων δεδομένων παγκοσμίως', 'UTF-16'), 'UTF-16'), 'UTF-8')
+-- !query analysis
+Project [encode(decode(encode(το Spark είναι το πιο δημοφιλές πλαίσιο 
επεξεργασίας μεγάλων δεδομένων παγκοσμίως, UTF-16), UTF-16), UTF-8) AS 
encode(decode(encode(το Spark είναι το πιο δημοφιλές πλαίσιο επεξεργασίας 
μεγάλων δεδομένων παγκοσμίως, UTF-16), UTF-16), UTF-8)#x]
++- OneRowRelation
+
+
+-- !query
+select encode(decode(encode('Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。', 'UTF-16'), 
'UTF-16'), 'UTF-8')
+-- !query analysis
+Project [encode(decode(encode(Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。, UTF-16), 
UTF-16), UTF-8) AS encode(decode(encode(Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。, 
UTF-16), UTF-16), UTF-8)#x]
++- OneRowRelation
+
+
 -- !query
 select decode()
 -- !query analysis
diff --git 
a/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out
 
b/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out
index c4f002d84ea6..98664dedf820 100644
--- 
a/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out
+++ 
b/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out
@@ -739,6 +739,41 @@ Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x]
    +- LocalRelation [scol#x, ecol#x]
 
 
+-- !query
+select encode(decode(encode('白日依山尽，黄河入海流。欲穷千里目，更上一层楼。', 'UTF-16'), 'UTF-16'), 
'UTF-8')
+-- !query analysis
+Project [encode(decode(encode(白日依山尽，黄河入海流。欲穷千里目，更上一层楼。, UTF-16), UTF-16), 
UTF-8) AS encode(decode(encode(白日依山尽，黄河入海流。欲穷千里目，更上一层楼。, UTF-16), UTF-16), 
UTF-8)#x]
++- OneRowRelation
+
+
+-- !query
+select encode(decode(encode('南山經之首曰䧿山。其首曰招搖之山，臨於西海之上。', 'UTF-16'), 'UTF-16'), 
'UTF-8')
+-- !query analysis
+Project [encode(decode(encode(南山經之首曰䧿山。其首曰招搖之山，臨於西海之上。, UTF-16), UTF-16), 
UTF-8) AS encode(decode(encode(南山經之首曰䧿山。其首曰招搖之山，臨於西海之上。, UTF-16), UTF-16), 
UTF-8)#x]
++- OneRowRelation
+
+
+-- !query
+select encode(decode(encode('세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark', 'UTF-16'), 
'UTF-16'), 'UTF-8')
+-- !query analysis
+Project [encode(decode(encode(세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark, UTF-16), 
UTF-16), UTF-8) AS encode(decode(encode(세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark, 
UTF-16), UTF-16), UTF-8)#x]
++- OneRowRelation
+
+
+-- !query
+select encode(decode(encode('το Spark είναι το πιο δημοφιλές πλαίσιο 
επεξεργασίας μεγάλων δεδομένων παγκοσμίως', 'UTF-16'), 'UTF-16'), 'UTF-8')
+-- !query analysis
+Project [encode(decode(encode(το Spark είναι το πιο δημοφιλές πλαίσιο 
επεξεργασίας μεγάλων δεδομένων παγκοσμίως, UTF-16), UTF-16), UTF-8) AS 
encode(decode(encode(το Spark είναι το πιο δημοφιλές πλαίσιο επεξεργασίας 
μεγάλων δεδομένων παγκοσμίως, UTF-16), UTF-16), UTF-8)#x]
++- OneRowRelation
+
+
+-- !query
+select encode(decode(encode('Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。', 'UTF-16'), 
'UTF-16'), 'UTF-8')
+-- !query analysis
+Project [encode(decode(encode(Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。, UTF-16), 
UTF-16), UTF-8) AS encode(decode(encode(Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。, 
UTF-16), UTF-16), UTF-8)#x]
++- OneRowRelation
+
+
 -- !query
 select decode()
 -- !query analysis
diff --git a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql 
b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql
index 256b8e0d49fa..c108f7c76f76 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql
@@ -132,6 +132,11 @@ select encode(scol, ecol) from values('渭城朝雨浥轻尘', 
'US-ASCII') as t(
 set spark.sql.legacy.codingErrorAction=false;
 select encode('客舍青青柳色新', 'US-ASCII');
 select encode(scol, ecol) from values('客舍青青柳色新', 'US-ASCII') as t(scol, ecol);
+select encode(decode(encode('白日依山尽，黄河入海流。欲穷千里目，更上一层楼。', 'UTF-16'), 'UTF-16'), 
'UTF-8');
+select encode(decode(encode('南山經之首曰䧿山。其首曰招搖之山，臨於西海之上。', 'UTF-16'), 'UTF-16'), 
'UTF-8');
+select encode(decode(encode('세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark', 'UTF-16'), 
'UTF-16'), 'UTF-8');
+select encode(decode(encode('το Spark είναι το πιο δημοφιλές πλαίσιο 
επεξεργασίας μεγάλων δεδομένων παγκοσμίως', 'UTF-16'), 'UTF-16'), 'UTF-8');
+select encode(decode(encode('Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。', 'UTF-16'), 
'UTF-16'), 'UTF-8');
 
 -- decode
 select decode();
diff --git 
a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out 
b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out
index 24d4cfa74b5a..da2fa9ca0c18 100644
--- 
a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out
+++ 
b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out
@@ -967,6 +967,46 @@ org.apache.spark.SparkRuntimeException
 }
 
 
+-- !query
+select encode(decode(encode('白日依山尽，黄河入海流。欲穷千里目，更上一层楼。', 'UTF-16'), 'UTF-16'), 
'UTF-8')
+-- !query schema
+struct<encode(decode(encode(白日依山尽，黄河入海流。欲穷千里目，更上一层楼。, UTF-16), UTF-16), 
UTF-8):binary>
+-- !query output
+白日依山尽，黄河入海流。欲穷千里目，更上一层楼。
+
+
+-- !query
+select encode(decode(encode('南山經之首曰䧿山。其首曰招搖之山，臨於西海之上。', 'UTF-16'), 'UTF-16'), 
'UTF-8')
+-- !query schema
+struct<encode(decode(encode(南山經之首曰䧿山。其首曰招搖之山，臨於西海之上。, UTF-16), UTF-16), 
UTF-8):binary>
+-- !query output
+南山經之首曰䧿山。其首曰招搖之山，臨於西海之上。
+
+
+-- !query
+select encode(decode(encode('세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark', 'UTF-16'), 
'UTF-16'), 'UTF-8')
+-- !query schema
+struct<encode(decode(encode(세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark, UTF-16), 
UTF-16), UTF-8):binary>
+-- !query output
+세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark
+
+
+-- !query
+select encode(decode(encode('το Spark είναι το πιο δημοφιλές πλαίσιο 
επεξεργασίας μεγάλων δεδομένων παγκοσμίως', 'UTF-16'), 'UTF-16'), 'UTF-8')
+-- !query schema
+struct<encode(decode(encode(το Spark είναι το πιο δημοφιλές πλαίσιο 
επεξεργασίας μεγάλων δεδομένων παγκοσμίως, UTF-16), UTF-16), UTF-8):binary>
+-- !query output
+το Spark είναι το πιο δημοφιλές πλαίσιο επεξεργασίας μεγάλων δεδομένων 
παγκοσμίως
+
+
+-- !query
+select encode(decode(encode('Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。', 'UTF-16'), 
'UTF-16'), 'UTF-8')
+-- !query schema
+struct<encode(decode(encode(Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。, UTF-16), 
UTF-16), UTF-8):binary>
+-- !query output
+Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。
+
+
 -- !query
 select decode()
 -- !query schema
diff --git 
a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out 
b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out
index 53f516dac03c..d42c387c8057 100644
--- a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out
@@ -899,6 +899,46 @@ org.apache.spark.SparkRuntimeException
 }
 
 
+-- !query
+select encode(decode(encode('白日依山尽，黄河入海流。欲穷千里目，更上一层楼。', 'UTF-16'), 'UTF-16'), 
'UTF-8')
+-- !query schema
+struct<encode(decode(encode(白日依山尽，黄河入海流。欲穷千里目，更上一层楼。, UTF-16), UTF-16), 
UTF-8):binary>
+-- !query output
+白日依山尽，黄河入海流。欲穷千里目，更上一层楼。
+
+
+-- !query
+select encode(decode(encode('南山經之首曰䧿山。其首曰招搖之山，臨於西海之上。', 'UTF-16'), 'UTF-16'), 
'UTF-8')
+-- !query schema
+struct<encode(decode(encode(南山經之首曰䧿山。其首曰招搖之山，臨於西海之上。, UTF-16), UTF-16), 
UTF-8):binary>
+-- !query output
+南山經之首曰䧿山。其首曰招搖之山，臨於西海之上。
+
+
+-- !query
+select encode(decode(encode('세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark', 'UTF-16'), 
'UTF-16'), 'UTF-8')
+-- !query schema
+struct<encode(decode(encode(세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark, UTF-16), 
UTF-16), UTF-8):binary>
+-- !query output
+세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark
+
+
+-- !query
+select encode(decode(encode('το Spark είναι το πιο δημοφιλές πλαίσιο 
επεξεργασίας μεγάλων δεδομένων παγκοσμίως', 'UTF-16'), 'UTF-16'), 'UTF-8')
+-- !query schema
+struct<encode(decode(encode(το Spark είναι το πιο δημοφιλές πλαίσιο 
επεξεργασίας μεγάλων δεδομένων παγκοσμίως, UTF-16), UTF-16), UTF-8):binary>
+-- !query output
+το Spark είναι το πιο δημοφιλές πλαίσιο επεξεργασίας μεγάλων δεδομένων 
παγκοσμίως
+
+
+-- !query
+select encode(decode(encode('Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。', 'UTF-16'), 
'UTF-16'), 'UTF-8')
+-- !query schema
+struct<encode(decode(encode(Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。, UTF-16), 
UTF-16), UTF-8):binary>
+-- !query output
+Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。
+
+
 -- !query
 select decode()
 -- !query schema
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/EncodeBenchmark.scala
 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/EncodeBenchmark.scala
new file mode 100644
index 000000000000..76ebd7f41677
--- /dev/null
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/EncodeBenchmark.scala
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.benchmark
+
+import org.apache.spark.benchmark.Benchmark
+
+/**
+ * Benchmark for the `encode` SQL function with the UTF-32, UTF-16 and UTF-8 charsets.
+ * To run this benchmark:
+ * {{{
+ *   1. without sbt:
+ *      bin/spark-submit --class <this class> --jars <spark core test jar> 
<sql core test jar>
+ *   2. build/sbt "sql/Test/runMain <this class>"
+ *   3. generate result:
+ *      SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/Test/runMain <this 
class>"
+ *      Results will be written to "benchmarks/EncodeBenchmark-results.txt".
+ * }}}
+ */
+object EncodeBenchmark extends SqlBasedBenchmark {
+  import spark.implicits._
+  private val N = 10L * 1000 * 1000 // number of rows produced by spark.range(N) below
+
+  override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
+    withTempPath { path =>
+      // scalastyle:off nonascii
+      val exprs = Seq(
+        "",
+        "Spark",
+        "白日依山尽，黄河入海流。欲穷千里目，更上一层楼。",
+        "το Spark είναι το πιο δημοφιλές πλαίσιο επεξεργασίας μεγάλων 
δεδομένων παγκοσμίως",
+        "세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark",
+        "Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。")
+      // scalastyle:on nonascii
+
+      spark.range(N).map { i =>
+        val idx = (i % exprs.length).toInt // cycle through the sample strings
+        val str = exprs(idx)
+        (str, str * 3, str * 5, str * 9, "") // repeated 1/3/5/9 times, plus an empty column
+      }.write.parquet(path.getCanonicalPath)
+
+      val benchmark = new Benchmark("encode", N, output = output)
+      def addBenchmarkCase(charset: String): Unit = { // one case encoding all five columns
+        benchmark.addCase(charset) { _ =>
+          spark.read.parquet(path.getCanonicalPath).selectExpr(
+            s"encode(_1, '$charset')",
+            s"encode(_2, '$charset')",
+            s"encode(_3, '$charset')",
+            s"encode(_4, '$charset')",
+            s"encode(_5, '$charset')").noop()
+        }
+      }
+      addBenchmarkCase("UTF-32")
+      addBenchmarkCase("UTF-16")
+      addBenchmarkCase("UTF-8")
+      benchmark.run()
+    }
+  }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(spark) branch master updated: [SPARK-48712][SQL] Perf Improvement for encode with empty values or UTF-8 charset

Reply via email to