spark git commit: [SPARK-25589][SQL][TEST] Add BloomFilterBenchmark

dongjoon Wed, 03 Oct 2018 04:15:02 -0700

Repository: spark
Updated Branches:
  refs/heads/master 928d0739c -> 1a5d83bed



[SPARK-25589][SQL][TEST] Add BloomFilterBenchmark

## What changes were proposed in this pull request?

This PR aims to add `BloomFilterBenchmark`. For the ORC data source, Apache Spark
has supported bloom filters for a long time. For the Parquet data source, support
is expected to be added with the next Parquet release update.

## How was this patch tested?

Manual.

```scala
SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain org.apache.spark.sql.execution.benchmark.BloomFilterBenchmark"
```

Closes #22605 from dongjoon-hyun/SPARK-25589.

Authored-by: Dongjoon Hyun <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1a5d83be
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1a5d83be
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1a5d83be

Branch: refs/heads/master
Commit: 1a5d83bed8a6df62ef643b08453c7dd8feebf93a
Parents: 928d073
Author: Dongjoon Hyun <[email protected]>
Authored: Wed Oct 3 04:14:07 2018 -0700
Committer: Dongjoon Hyun <[email protected]>
Committed: Wed Oct 3 04:14:07 2018 -0700

----------------------------------------------------------------------
 .../benchmarks/BloomFilterBenchmark-results.txt | 24 ++++++
 .../benchmark/BloomFilterBenchmark.scala        | 87 ++++++++++++++++++++
 2 files changed, 111 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/1a5d83be/sql/core/benchmarks/BloomFilterBenchmark-results.txt
----------------------------------------------------------------------
diff --git a/sql/core/benchmarks/BloomFilterBenchmark-results.txt b/sql/core/benchmarks/BloomFilterBenchmark-results.txt
new file mode 100644
index 0000000..2eeb26c
--- /dev/null
+++ b/sql/core/benchmarks/BloomFilterBenchmark-results.txt
@@ -0,0 +1,24 @@
+================================================================================================
+ORC Write
+================================================================================================
+
+OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Write 100M rows:                         Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------
+Without bloom filter                        16765 / 17587          6.0         167.7       1.0X
+With bloom filter                           20060 / 20626          5.0         200.6       0.8X
+
+
+================================================================================================
+ORC Read
+================================================================================================
+
+OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Read a row from 100M rows:               Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------
+Without bloom filter                          1857 / 1904         53.9          18.6       1.0X
+With bloom filter                             1399 / 1437         71.5          14.0       1.3X
+
+

http://git-wip-us.apache.org/repos/asf/spark/blob/1a5d83be/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala
new file mode 100644
index 0000000..2f3caca
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.benchmark
+
+import scala.util.Random
+
+import org.apache.spark.benchmark.Benchmark
+
+/**
+ * Benchmark to measure read performance with Bloom filters.
+ *
+ * Currently, only ORC supports bloom filters, we will add Parquet BM as soon as it becomes
+ * available.
+ *
+ * To run this benchmark:
+ * {{{
+ *   1. without sbt: bin/spark-submit --class <this class> <spark sql test jar>
+ *   2. build/sbt "sql/test:runMain <this class>"
+ *   3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain <this class>"
+ *      Results will be written to "benchmarks/BloomFilterBenchmark-results.txt".
+ * }}}
+ */
+object BloomFilterBenchmark extends SqlBasedBenchmark {
+  import spark.implicits._
+
+  private val scaleFactor = 100
+  private val N = scaleFactor * 1000 * 1000
+  private val df = spark.range(N).map(_ => Random.nextInt)
+
+  private def writeBenchmark(): Unit = {
+    withTempPath { dir =>
+      val path = dir.getCanonicalPath
+
+      runBenchmark(s"ORC Write") {
+        val benchmark = new Benchmark(s"Write ${scaleFactor}M rows", N, output = output)
+        benchmark.addCase("Without bloom filter") { _ =>
+          df.write.mode("overwrite").orc(path + "/withoutBF")
+        }
+        benchmark.addCase("With bloom filter") { _ =>
+          df.write.mode("overwrite")
+            .option("orc.bloom.filter.columns", "value").orc(path + "/withBF")
+        }
+        benchmark.run()
+      }
+    }
+  }
+
+  private def readBenchmark(): Unit = {
+    withTempPath { dir =>
+      val path = dir.getCanonicalPath
+
+      df.write.orc(path + "/withoutBF")
+      df.write.option("orc.bloom.filter.columns", "value").orc(path + "/withBF")
+
+      runBenchmark(s"ORC Read") {
+        val benchmark = new Benchmark(s"Read a row from ${scaleFactor}M rows", N, output = output)
+        benchmark.addCase("Without bloom filter") { _ =>
+          spark.read.orc(path + "/withoutBF").where("value = 0").count
+        }
+        benchmark.addCase("With bloom filter") { _ =>
+          spark.read.orc(path + "/withBF").where("value = 0").count
+        }
+        benchmark.run()
+      }
+    }
+  }
+
+  override def runBenchmarkSuite(): Unit = {
+    writeBenchmark()
+    readBenchmark()
+  }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

spark git commit: [SPARK-25589][SQL][TEST] Add BloomFilterBenchmark

Reply via email to