This is an automated email from the ASF dual-hosted git repository.
dbtsai pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new f499f45 [SPARK-31364][SQL][TESTS] Benchmark Parquet Nested Field
Predicate Pushdown
f499f45 is described below
commit f499f456d7318dc287b628b1d47916c9e91bedf4
Author: Jian Tang <[email protected]>
AuthorDate: Fri Apr 24 22:10:58 2020 +0000
[SPARK-31364][SQL][TESTS] Benchmark Parquet Nested Field Predicate Pushdown
### What changes were proposed in this pull request?
This PR aims to add a benchmark suite for nested predicate pushdown with
parquet file:
Performance comparison: Nested predicate pushdown disabled vs enabled,
with the following queries scenarios:
1. When predicate pushed down, parquet reader are able to filter out all
the row groups without loading them.
2. When predicate pushed down, parquet reader only loads one of the row
groups.
3. When predicate pushed down, parquet reader can't filter out any row
group in order to see if we introduce too much overhead or not when enabling
nested predicate push down.
### Why are the changes needed?
No benchmark exists today for nested fields predicate pushdown performance
evaluation.
### Does this PR introduce any user-facing change?
No
### How was this patch tested?
Benchmark runs and reporting result.
Closes #28319 from JiJiTang/SPARK-31364.
Authored-by: Jian Tang <[email protected]>
Signed-off-by: DB Tsai <[email protected]>
(cherry picked from commit 6a576161ae3a113c2e329595b9a6f56d4210e0cf)
Signed-off-by: DB Tsai <[email protected]>
---
...tedPredicatePushDownBenchmark-jdk11-results.txt | 21 ++++
...uetNestedPredicatePushDownBenchmark-results.txt | 21 ++++
.../ParquetNestedPredicatePushDownBenchmark.scala | 113 +++++++++++++++++++++
3 files changed, 155 insertions(+)
diff --git
a/sql/core/benchmarks/ParquetNestedPredicatePushDownBenchmark-jdk11-results.txt
b/sql/core/benchmarks/ParquetNestedPredicatePushDownBenchmark-jdk11-results.txt
new file mode 100644
index 0000000..c33f8a3
--- /dev/null
+++
b/sql/core/benchmarks/ParquetNestedPredicatePushDownBenchmark-jdk11-results.txt
@@ -0,0 +1,21 @@
+OpenJDK 64-Bit Server VM 11.0.2+9 on Mac OS X 10.14.6
+Intel(R) Core(TM) i7-7920HQ CPU @ 3.10GHz
+Can skip all row groups: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------------------------------
+Without nested predicate Pushdown 34214 35752
NaN 3.1 326.3 1.0X
+With nested predicate Pushdown 86 102
11 1216.2 0.8 396.8X
+
+OpenJDK 64-Bit Server VM 11.0.2+9 on Mac OS X 10.14.6
+Intel(R) Core(TM) i7-7920HQ CPU @ 3.10GHz
+Can skip some row groups: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------------------------------
+Without nested predicate Pushdown 34211 35162
843 3.1 326.3 1.0X
+With nested predicate Pushdown 3470 3514
36 30.2 33.1 9.9X
+
+OpenJDK 64-Bit Server VM 11.0.2+9 on Mac OS X 10.14.6
+Intel(R) Core(TM) i7-7920HQ CPU @ 3.10GHz
+Can skip no row groups: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------------------------------
+Without nested predicate Pushdown 37533 37919
329 2.8 357.9 1.0X
+With nested predicate Pushdown 37876 39132
536 2.8 361.2 1.0X
+
diff --git
a/sql/core/benchmarks/ParquetNestedPredicatePushDownBenchmark-results.txt
b/sql/core/benchmarks/ParquetNestedPredicatePushDownBenchmark-results.txt
new file mode 100644
index 0000000..35dd4f0
--- /dev/null
+++ b/sql/core/benchmarks/ParquetNestedPredicatePushDownBenchmark-results.txt
@@ -0,0 +1,21 @@
+OpenJDK 64-Bit Server VM 1.8.0_252-b09 on Mac OS X 10.14.6
+Intel(R) Core(TM) i7-7920HQ CPU @ 3.10GHz
+Can skip all row groups: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------------------------------
+Without nested predicate Pushdown 30687 31552
NaN 3.4 292.7 1.0X
+With nested predicate Pushdown 105 150
61 999.3 1.0 292.5X
+
+OpenJDK 64-Bit Server VM 1.8.0_252-b09 on Mac OS X 10.14.6
+Intel(R) Core(TM) i7-7920HQ CPU @ 3.10GHz
+Can skip some row groups: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------------------------------
+Without nested predicate Pushdown 30505 31828
NaN 3.4 290.9 1.0X
+With nested predicate Pushdown 3156 3215
77 33.2 30.1 9.7X
+
+OpenJDK 64-Bit Server VM 1.8.0_252-b09 on Mac OS X 10.14.6
+Intel(R) Core(TM) i7-7920HQ CPU @ 3.10GHz
+Can skip no row groups: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------------------------------
+Without nested predicate Pushdown 34475 35302
NaN 3.0 328.8 1.0X
+With nested predicate Pushdown 34003 34596
567 3.1 324.3 1.0X
+
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/ParquetNestedPredicatePushDownBenchmark.scala
b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/ParquetNestedPredicatePushDownBenchmark.scala
new file mode 100644
index 0000000..11bc91a
--- /dev/null
+++
b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/ParquetNestedPredicatePushDownBenchmark.scala
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.benchmark
+
+import org.apache.spark.SparkConf
+import org.apache.spark.benchmark.Benchmark
+import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
+import org.apache.spark.sql.internal.SQLConf
+
+/**
+ * Synthetic benchmark for nested fields predicate push down performance for
Parquet datasource.
+ * To run this benchmark:
+ * {{{
+ * 1. without sbt:
+ * bin/spark-submit --class <this class> --jars <spark core test jar>
<sql core test jar>
+ * 2. build/sbt "sql/test:runMain <this class>"
+ * 3. generate result:
+ * SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain <this
class>"
+ * Results will be written to
"benchmarks/ParquetNestedPredicatePushDownBenchmark-results.txt".
+ * }}}
+ */
+object ParquetNestedPredicatePushDownBenchmark extends SqlBasedBenchmark {
+
+ private val N = 100 * 1024 * 1024
+ private val NUMBER_OF_ITER = 10
+
+ private val df: DataFrame = spark
+ .range(1, N, 1, 4)
+ .toDF("id")
+ .selectExpr("id", "STRUCT(id x, STRUCT(CAST(id AS STRING) z) y) nested")
+ .sort("id")
+
+ private def addCase(
+ benchmark: Benchmark,
+ inputPath: String,
+ enableNestedPD: Boolean,
+ name: String,
+ withFilter: DataFrame => DataFrame): Unit = {
+ val loadDF = spark.read.parquet(inputPath)
+ benchmark.addCase(name) { _ =>
+ withSQLConf((SQLConf.NESTED_PREDICATE_PUSHDOWN_ENABLED.key,
enableNestedPD.toString)) {
+ withFilter(loadDF).noop()
+ }
+ }
+ }
+
+ private def createAndRunBenchmark(name: String, withFilter: DataFrame =>
DataFrame): Unit = {
+ withTempPath { tempDir =>
+ val outputPath = tempDir.getCanonicalPath
+ df.write.mode(SaveMode.Overwrite).parquet(outputPath)
+ val benchmark = new Benchmark(name, N, NUMBER_OF_ITER, output = output)
+ addCase(
+ benchmark,
+ outputPath,
+ enableNestedPD = false,
+ "Without nested predicate Pushdown",
+ withFilter)
+ addCase(
+ benchmark,
+ outputPath,
+ enableNestedPD = true,
+ "With nested predicate Pushdown",
+ withFilter)
+ benchmark.run()
+ }
+ }
+
+ /**
+ * Benchmark for sorted data with a filter which allows to filter out all
the row groups
+ * when nested fields predicate push down enabled
+ */
+ def runLoadNoRowGroupWhenPredicatePushedDown(): Unit = {
+ createAndRunBenchmark("Can skip all row groups", _.filter("nested.x < 0"))
+ }
+
+ /**
+ * Benchmark with a filter which allows to load only some row groups
+ * when nested fields predicate push down enabled
+ */
+ def runLoadSomeRowGroupWhenPredicatePushedDown(): Unit = {
+ createAndRunBenchmark("Can skip some row groups", _.filter("nested.x =
100"))
+ }
+
+ /**
+ * Benchmark with a filter which still requires to
+ * load all the row groups on sorted data to see if we introduce too much
+ * overhead or not if enable nested predicate push down.
+ */
+ def runLoadAllRowGroupsWhenPredicatePushedDown(): Unit = {
+ createAndRunBenchmark("Can skip no row groups", _.filter(s"nested.x >= 0
and nested.x <= $N"))
+ }
+
+ override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
+ runLoadNoRowGroupWhenPredicatePushedDown()
+ runLoadSomeRowGroupWhenPredicatePushedDown()
+ runLoadAllRowGroupsWhenPredicatePushedDown()
+ }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]