Github user cloud-fan commented on a diff in the pull request:
https://github.com/apache/spark/pull/19943#discussion_r160016468
--- Diff:
sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala ---
@@ -0,0 +1,435 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.hive.orc
+
+import java.io.File
+
+import scala.util.{Random, Try}
+
+import org.apache.spark.SparkConf
+import org.apache.spark.sql.{DataFrame, SparkSession}
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.types._
+import org.apache.spark.util.{Benchmark, Utils}
+
+
+/**
+ * Benchmark to measure ORC read performance.
+ *
+ * This is in the `sql/hive` module in order to compare the `sql/core` and
+ * `sql/hive` ORC data sources.
+ */
+// scalastyle:off line.size.limit
+object OrcReadBenchmark {
+ val conf = new SparkConf()
+ conf.set("orc.compression", "snappy")
+
+ private val spark = SparkSession.builder()
+ .master("local[1]")
+ .appName("OrcReadBenchmark")
+ .config(conf)
+ .getOrCreate()
+
+ // Set default configs. Individual cases will change them if necessary.
+ spark.conf.set(SQLConf.ORC_FILTER_PUSHDOWN_ENABLED.key, "true")
+
+ def withTempPath(f: File => Unit): Unit = {
+ val path = Utils.createTempDir()
+ path.delete()
+ try f(path) finally Utils.deleteRecursively(path)
+ }
+
+ def withTempTable(tableNames: String*)(f: => Unit): Unit = {
+ try f finally tableNames.foreach(spark.catalog.dropTempView)
+ }
+
+ def withSQLConf(pairs: (String, String)*)(f: => Unit): Unit = {
+ val (keys, values) = pairs.unzip
+ val currentValues = keys.map(key => Try(spark.conf.get(key)).toOption)
+ (keys, values).zipped.foreach(spark.conf.set)
+ try f finally {
+ keys.zip(currentValues).foreach {
+ case (key, Some(value)) => spark.conf.set(key, value)
+ case (key, None) => spark.conf.unset(key)
+ }
+ }
+ }
+
+ private val NATIVE_ORC_FORMAT =
"org.apache.spark.sql.execution.datasources.orc.OrcFileFormat"
+ private val HIVE_ORC_FORMAT =
"org.apache.spark.sql.hive.orc.OrcFileFormat"
+
+ private def prepareTable(dir: File, df: DataFrame, partition:
Option[String] = None): Unit = {
+ val dirORC = dir.getCanonicalPath
+
+ if (partition.isDefined) {
+ df.write.partitionBy(partition.get).orc(dirORC)
+ } else {
+ df.write.orc(dirORC)
+ }
+
+
spark.read.format(NATIVE_ORC_FORMAT).load(dirORC).createOrReplaceTempView("nativeOrcTable")
+
spark.read.format(HIVE_ORC_FORMAT).load(dirORC).createOrReplaceTempView("hiveOrcTable")
+ }
+
+ def numericScanBenchmark(values: Int, dataType: DataType): Unit = {
+ val sqlBenchmark = new Benchmark(s"SQL Single ${dataType.sql} Column
Scan", values)
+
+ withTempPath { dir =>
+ withTempTable("t1", "nativeOrcTable", "hiveOrcTable") {
+ import spark.implicits._
+ spark.range(values).map(_ =>
Random.nextLong).createOrReplaceTempView("t1")
+
+ prepareTable(dir, spark.sql(s"SELECT CAST(value as
${dataType.sql}) id FROM t1"))
+
+ sqlBenchmark.addCase("Native ORC Vectorized") { _ =>
+ spark.sql("SELECT sum(id) FROM nativeOrcTable").collect()
+ }
+
+ sqlBenchmark.addCase("Native ORC MR") { _ =>
+ withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key ->
"false") {
+ spark.sql("SELECT sum(id) FROM nativeOrcTable").collect()
+ }
+ }
+
+ sqlBenchmark.addCase("Hive built-in ORC") { _ =>
+ spark.sql("SELECT sum(id) FROM hiveOrcTable").collect()
+ }
+
+ /*
+ Java HotSpot(TM) 64-Bit Server VM 1.8.0_152-b16 on Mac OS X 10.13.1
+ Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz
+
+ SQL Single TINYINT Column Scan: Best/Avg Time(ms)
Rate(M/s) Per Row(ns) Relative
+
------------------------------------------------------------------------------------------------
+ Native ORC Vectorized 156 / 163
100.7 9.9 1.0X
+ Native ORC MR 1222 / 1236
12.9 77.7 0.1X
+ Hive built-in ORC 1572 / 1625
10.0 100.0 0.1X
+
+ SQL Single SMALLINT Column Scan: Best/Avg Time(ms)
Rate(M/s) Per Row(ns) Relative
+
------------------------------------------------------------------------------------------------
+ Native ORC Vectorized 215 / 225
73.1 13.7 1.0X
+ Native ORC MR 1337 / 1458
11.8 85.0 0.2X
+ Hive built-in ORC 1696 / 1707
9.3 107.8 0.1X
+
+ SQL Single INT Column Scan: Best/Avg Time(ms)
Rate(M/s) Per Row(ns) Relative
+
------------------------------------------------------------------------------------------------
+ Native ORC Vectorized 285 / 294
55.2 18.1 1.0X
+ Native ORC MR 1397 / 1419
11.3 88.8 0.2X
+ Hive built-in ORC 2086 / 2143
7.5 132.6 0.1X
+
+ SQL Single BIGINT Column Scan: Best/Avg Time(ms)
Rate(M/s) Per Row(ns) Relative
+
------------------------------------------------------------------------------------------------
+ Native ORC Vectorized 341 / 350
46.1 21.7 1.0X
+ Native ORC MR 1461 / 1492
10.8 92.9 0.2X
+ Hive built-in ORC 2002 / 2095
7.9 127.3 0.2X
+
+ SQL Single FLOAT Column Scan: Best/Avg Time(ms)
Rate(M/s) Per Row(ns) Relative
+
------------------------------------------------------------------------------------------------
+ Native ORC Vectorized 347 / 356
45.3 22.1 1.0X
+ Native ORC MR 1524 / 1553
10.3 96.9 0.2X
+ Hive built-in ORC 2393 / 2404
6.6 152.1 0.1X
+
+ SQL Single DOUBLE Column Scan: Best/Avg Time(ms)
Rate(M/s) Per Row(ns) Relative
+
------------------------------------------------------------------------------------------------
+ Native ORC Vectorized 403 / 411
39.0 25.6 1.0X
+ Native ORC MR 1517 / 1529
10.4 96.5 0.3X
+ Hive built-in ORC 2054 / 2134
7.7 130.6 0.2X
+ */
+ sqlBenchmark.run()
+ }
+ }
+ }
+
+ def intStringScanBenchmark(values: Int): Unit = {
+ val benchmark = new Benchmark("Int and String Scan", values)
+
+ withTempPath { dir =>
+ withTempTable("t1", "nativeOrcTable", "hiveOrcTable") {
+ import spark.implicits._
+ spark.range(values).map(_ =>
Random.nextLong).createOrReplaceTempView("t1")
+
+ prepareTable(
+ dir,
+ spark.sql("SELECT CAST(value AS INT) AS c1, CAST(value as
STRING) AS c2 FROM t1"))
+
+ benchmark.addCase("Native ORC Vectorized") { _ =>
+ spark.sql("SELECT sum(c1), sum(length(c2)) FROM
nativeOrcTable").collect()
+ }
+
+ benchmark.addCase("Native ORC MR") { _ =>
+ withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key ->
"false") {
+ spark.sql("SELECT sum(c1), sum(length(c2)) FROM
nativeOrcTable").collect()
+ }
+ }
+
+ benchmark.addCase("Hive built-in ORC") { _ =>
+ spark.sql("SELECT sum(c1), sum(length(c2)) FROM
hiveOrcTable").collect()
+ }
+
+ /*
+ Java HotSpot(TM) 64-Bit Server VM 1.8.0_152-b16 on Mac OS X 10.13.1
+ Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz
+
+ Int and String Scan: Best/Avg Time(ms)
Rate(M/s) Per Row(ns) Relative
+
------------------------------------------------------------------------------------------------
+ Native ORC Vectorized 1382 / 1400
7.6 131.8 1.0X
+ Native ORC MR 2689 / 2765
3.9 256.4 0.5X
+ Hive built-in ORC 3889 / 3894
2.7 370.9 0.4X
+ */
+ benchmark.run()
+ }
+ }
+ }
+
+ def partitionTableScanBenchmark(values: Int): Unit = {
+ val benchmark = new Benchmark("Partitioned Table", values)
+
+ withTempPath { dir =>
+ withTempTable("t1", "nativeOrcTable", "hiveOrcTable") {
+ import spark.implicits._
+ spark.range(values).map(_ =>
Random.nextLong).createOrReplaceTempView("t1")
+
+ prepareTable(dir, spark.sql("SELECT value % 2 AS p, value AS id
FROM t1"), Some("p"))
+
+ benchmark.addCase("Read data column - Native ORC Vectorized") { _
=>
+ spark.sql("SELECT sum(id) FROM nativeOrcTable").collect()
+ }
+
+ benchmark.addCase("Read data column - Native ORC MR") { _ =>
+ withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key ->
"false") {
+ spark.sql("SELECT sum(id) FROM nativeOrcTable").collect()
+ }
+ }
+
+ benchmark.addCase("Read data column - Hive built-in ORC") { _ =>
+ spark.sql("SELECT sum(id) FROM hiveOrcTable").collect()
+ }
+
+ benchmark.addCase("Read partition column - Native ORC Vectorized")
{ _ =>
+ spark.sql("SELECT sum(p) FROM nativeOrcTable").collect()
+ }
+
+ benchmark.addCase("Read partition column - Native ORC MR") { _ =>
+ withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key ->
"false") {
+ spark.sql("SELECT sum(p) FROM nativeOrcTable").collect()
+ }
+ }
+
+ benchmark.addCase("Read partition column - Hive built-in ORC") { _
=>
+ spark.sql("SELECT sum(p) FROM hiveOrcTable").collect()
+ }
+
+ benchmark.addCase("Read both columns - Native ORC Vectorized") { _
=>
+ spark.sql("SELECT sum(p), sum(id) FROM nativeOrcTable").collect()
+ }
+
+ benchmark.addCase("Read both columns - Native ORC MR") { _ =>
+ withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key ->
"false") {
+ spark.sql("SELECT sum(p), sum(id) FROM
nativeOrcTable").collect()
+ }
+ }
+
+ benchmark.addCase("Read both columns - Hive built-in ORC") { _ =>
+ spark.sql("SELECT sum(p), sum(id) FROM hiveOrcTable").collect()
+ }
+
+ /*
+ Java HotSpot(TM) 64-Bit Server VM 1.8.0_152-b16 on Mac OS X 10.13.1
+ Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz
+
+ Partitioned Table: Best/Avg Time(ms)
Rate(M/s) Per Row(ns) Relative
+
------------------------------------------------------------------------------------------------
+ Read data column - Native ORC Vectorized 347 / 350
45.3 22.1 1.0X
+ Read data column - Native ORC MR 1611 / 1613
9.8 102.4 0.2X
+ Read data column - Hive built-in ORC 2082 / 2091
7.6 132.4 0.2X
+ Read partition column - Native ORC Vectorized 55 / 57
286.5 3.5 6.3X
+ Read partition column - Native ORC MR 1062 / 1063
14.8 67.5 0.3X
+ Read partition column - Hive built-in ORC 1334 / 1334
11.8 84.8 0.3X
+ Read both columns - Native ORC Vectorized 380 / 388
41.3 24.2 0.9X
+ Read both columns - Native ORC MR 1654 / 1672
9.5 105.2 0.2X
+ Read both columns - Hive built-in ORC 2209 / 2209
7.1 140.5 0.2X
+ */
+ benchmark.run()
+ }
+ }
+ }
+
+ def stringDictionaryScanBenchmark(values: Int): Unit = {
+ val benchmark = new Benchmark("String Dictionary", values)
+
+ withTempPath { dir =>
+ withTempTable("t1", "nativeOrcTable", "hiveOrcTable") {
+ spark.range(values).createOrReplaceTempView("t1")
+
+ prepareTable(dir, spark.sql("SELECT CAST((id % 200) + 10000 as
STRING) AS c1 FROM t1"))
+
+ benchmark.addCase("Native ORC Vectorized") { _ =>
+ spark.sql("SELECT sum(length(c1)) FROM nativeOrcTable").collect()
+ }
+
+ benchmark.addCase("Native ORC MR") { _ =>
+ withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key ->
"false") {
+ spark.sql("SELECT sum(length(c1)) FROM
nativeOrcTable").collect()
+ }
+ }
+
+ benchmark.addCase("Hive built-in ORC") { _ =>
+ spark.sql("SELECT sum(length(c1)) FROM hiveOrcTable").collect()
+ }
+
+ /*
+ Java HotSpot(TM) 64-Bit Server VM 1.8.0_152-b16 on Mac OS X 10.13.1
+ Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz
+
+ String Dictionary: Best/Avg Time(ms)
Rate(M/s) Per Row(ns) Relative
+
------------------------------------------------------------------------------------------------
+ Native ORC Vectorized 406 / 414
25.8 38.7 1.0X
+ Native ORC MR 1372 / 1381
7.6 130.8 0.3X
+ Hive built-in ORC 2016 / 2036
5.2 192.2 0.2X
+ */
+ benchmark.run()
+ }
+ }
+ }
+
+ def stringWithNullsScanBenchmark(values: Int, fractionOfNulls: Double):
Unit = {
+ withTempPath { dir =>
+ withTempTable("t1", "nativeOrcTable", "hiveOrcTable") {
+ spark.range(values).createOrReplaceTempView("t1")
+
+ prepareTable(
+ dir,
+ spark.sql(
+ s"SELECT IF(RAND(1) < $fractionOfNulls, NULL, CAST(id as
STRING)) AS c1, " +
+ s"IF(RAND(2) < $fractionOfNulls, NULL, CAST(id as STRING)) AS
c2 FROM t1"))
+
+ val benchmark = new Benchmark("String with Nulls Scan", values)
+
+ benchmark.addCase(s"Native ORC Vectorized ($fractionOfNulls%)") {
_ =>
+ spark.sql("SELECT SUM(LENGTH(c2)) FROM nativeOrcTable " +
+ "WHERE c1 IS NOT NULL AND c2 IS NOT NULL").collect()
+ }
+
+ benchmark.addCase(s"Native ORC MR ($fractionOfNulls%)") { _ =>
+ withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key ->
"false") {
+ spark.sql("SELECT SUM(LENGTH(c2)) FROM nativeOrcTable " +
+ "WHERE c1 IS NOT NULL AND c2 IS NOT NULL").collect()
+ }
+ }
+
+ benchmark.addCase(s"Hive built-in ORC ($fractionOfNulls%)") { _ =>
+ spark.sql("SELECT SUM(LENGTH(c2)) FROM hiveOrcTable " +
+ "WHERE c1 IS NOT NULL AND c2 IS NOT NULL").collect()
+ }
+
+ /*
+ Java HotSpot(TM) 64-Bit Server VM 1.8.0_152-b16 on Mac OS X 10.13.1
+ Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz
+
+ String with Nulls Scan: Best/Avg Time(ms)
Rate(M/s) Per Row(ns) Relative
+
------------------------------------------------------------------------------------------------
+ Native ORC Vectorized (0.0%) 1122 / 1129
9.3 107.0 1.0X
+ Native ORC MR (0.0%) 2551 / 2619
4.1 243.3 0.4X
--- End diff --
Switch the order so that `Native ORC MR` comes first and serves as the
baseline; the `Relative` column will then be easier to read.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]