[
https://issues.apache.org/jira/browse/SPARK-26345?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17248374#comment-17248374
]
Yuming Wang commented on SPARK-26345:
-------------------------------------
Benchmark and benchmark result:
{code:scala}
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.sql.execution.benchmark
import java.io.File
import scala.util.Random
import org.apache.spark.SparkConf
import org.apache.spark.benchmark.Benchmark
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions.{monotonically_increasing_id,
timestamp_seconds}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.internal.SQLConf.ParquetOutputTimestampType
import org.apache.spark.sql.types.{ByteType, Decimal, DecimalType}
/**
 * Benchmark to measure read performance with Parquet column index.
 * To run this benchmark:
 * {{{
 *   1. without sbt: bin/spark-submit --class <this class> <spark sql test jar>
 *   2. build/sbt "sql/test:runMain <this class>"
 *   3. generate result:
 *      SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain <this class>"
 *      Results will be written to "benchmarks/ParquetFilterPushdownBenchmark-results.txt".
 * }}}
 */
object ParquetFilterPushdownBenchmark extends SqlBasedBenchmark {
/**
 * Builds the SparkSession for this benchmark.
 *
 * `spark.master` is always set to `local[1]`. The memory and compression
 * settings are fallbacks only: they are applied with `setIfMissing`, so values
 * that are already present in the configuration win.
 */
override def getSparkSession: SparkSession = {
  val sessionConf = new SparkConf().setAppName(this.getClass.getSimpleName)
  // Since `spark.master` always exists, overrides this value
  sessionConf.set("spark.master", "local[1]")

  val fallbackSettings = Seq(
    "spark.driver.memory" -> "3g",
    "spark.executor.memory" -> "3g",
    "orc.compression" -> "snappy",
    "spark.sql.parquet.compression.codec" -> "snappy")
  fallbackSettings.foreach { case (key, value) => sessionConf.setIfMissing(key, value) }

  SparkSession.builder().config(sessionConf).getOrCreate()
}
// Number of rows written to each benchmark table (15 * 2^20 = 15728640).
private val numRows: Int = 15 * 1024 * 1024
// Number of payload columns (c1..c5) stored next to the filtered `value` column.
private val width: Int = 5
// Middle of the [0, numRows) range; used as the probe value for point lookups.
private val mid: Int = numRows / 2
/**
 * Runs `f` and afterwards drops the given temporary views, even when `f` throws.
 *
 * @param tableNames names of the temp views to drop once `f` has finished
 * @param f          the body to execute
 */
def withTempTable(tableNames: String*)(f: => Unit): Unit = {
  try {
    f
  } finally {
    for (viewName <- tableNames) {
      spark.catalog.dropTempView(viewName)
    }
  }
}
/**
 * Writes a table of `numRows` rows sorted by `value` and registers it through
 * `saveAsTable`.
 *
 * Columns `c1..c<width>` hold a random long rendered as a string; `value` is
 * `monotonically_increasing_id()`, optionally cast to string.
 *
 * @param dir               directory handed to `saveAsTable`
 * @param numRows           number of rows to generate
 * @param width             number of payload columns
 * @param useStringForValue when true, `value` is stored as a string column
 */
private def prepareTable(
    dir: File, numRows: Int, width: Int, useStringForValue: Boolean): Unit = {
  import spark.implicits._

  val payloadColumns = for (i <- 1 to width) yield s"CAST(value AS STRING) c$i"
  val idColumn = monotonically_increasing_id()
  val valueCol = if (useStringForValue) idColumn.cast("string") else idColumn

  // One random long per row; the generated column is named `value` until it is replaced below.
  val randomLongs = spark.range(numRows).map(_ => Random.nextLong())
  val table = randomLongs
    .selectExpr(payloadColumns: _*)
    .withColumn("value", valueCol)
    .sort("value")

  saveAsTable(table, dir)
}
/**
 * Writes a table whose string `value` column cycles through
 * `numDistinctValues` distinct values (`id % numDistinctValues`), sorted by
 * `value`, next to `width` columns of random numbers rendered as strings.
 *
 * @param dir               directory handed to `saveAsTable`
 * @param numRows           number of rows to generate
 * @param numDistinctValues number of distinct values in the `value` column
 * @param width             number of payload columns
 */
private def prepareStringDictTable(
    dir: File, numRows: Int, numDistinctValues: Int, width: Int): Unit = {
  val valueExpr = s"CAST(id % $numDistinctValues AS STRING) AS value"
  val payloadExprs = (1 to width).map(i => s"CAST(rand() AS STRING) c$i")
  // `value` comes first, followed by c1..c<width>.
  val projection = valueExpr +: payloadExprs

  val table = spark.range(numRows).selectExpr(projection: _*).sort("value")
  saveAsTable(table, dir, useDictionary = true)
}
/**
 * Overwrites `<dir>/parquet` with `df` and exposes the written files as the
 * temp view `parquetTable`.
 *
 * @param df            data to write
 * @param dir           parent directory of the Parquet output
 * @param useDictionary accepted for call-site compatibility; NOTE(review): this
 *                      flag is not read anywhere in the method body, so it
 *                      currently has no effect on how the files are written
 */
private def saveAsTable(df: DataFrame, dir: File, useDictionary: Boolean = false): Unit = {
  val parquetPath = s"${dir.getCanonicalPath}/parquet"
  df.write
    .mode("overwrite")
    .parquet(parquetPath)

  val written = spark.read.parquet(parquetPath)
  written.createOrReplaceTempView("parquetTable")
}
/**
 * Benchmarks one filter query against `parquetTable` in two cases: with
 * `parquet.filter.columnindex.enabled` set to false, then set to true.
 *
 * @param values     passed to `Benchmark` as its second argument (callers in
 *                   this file pass the table's row count)
 * @param title      benchmark title
 * @param whereExpr  SQL predicate placed after WHERE
 * @param selectExpr SQL projection list; defaults to `*`
 */
def filterPushDownBenchmark(
    values: Int,
    title: String,
    whereExpr: String,
    selectExpr: String = "*"): Unit = {
  val benchmark = new Benchmark(title, values, minNumIters = 5, output = output)
  // The query text is the same for both cases, so build it once. It must stay on a
  // single line: a plain string literal cannot span a line break.
  val query = s"SELECT $selectExpr FROM parquetTable WHERE $whereExpr"

  Seq(false, true).foreach { columnIndexEnabled =>
    val suffix = if (columnIndexEnabled) "(columnIndex)" else ""
    val name = s"Parquet Vectorized $suffix"
    benchmark.addCase(name) { _ =>
      withSQLConf("parquet.filter.columnindex.enabled" -> columnIndexEnabled.toString) {
        spark.sql(query).noop()
      }
    }
  }
  benchmark.run()
}
/**
 * Runs the integer-`value` cases: predicates selecting zero rows, exactly one
 * row, 10/50/90 percent of the rows, and all rows.
 *
 * @param numRows number of rows in `parquetTable`
 * @param width   number of payload columns (c1..c<width>) in `parquetTable`
 * @param mid     probe value used by the zero-row and one-row predicates
 */
private def runIntBenchmark(numRows: Int, width: Int, mid: Int): Unit = {
  // Display only: turns "a < value AND value < b" into "a < value < b" in titles.
  def shorten(title: String): String = title.replace("value AND value", "value")

  Seq("value IS NULL", s"$mid < value AND value < $mid").foreach { whereExpr =>
    filterPushDownBenchmark(numRows, shorten(s"Select 0 int row ($whereExpr)"), whereExpr)
  }

  Seq(
    s"value = $mid",
    s"value <=> $mid",
    s"$mid <= value AND value <= $mid",
    s"${mid - 1} < value AND value < ${mid + 1}"
  ).foreach { whereExpr =>
    filterPushDownBenchmark(numRows, shorten(s"Select 1 int row ($whereExpr)"), whereExpr)
  }

  // Aggregating projection: "MAX(c1),...,MAX(cN), MAX(value)".
  val selectExpr = (1 to width).map(i => s"MAX(c$i)").mkString("", ",", ", MAX(value)")

  Seq(10, 50, 90).foreach { percent =>
    // Widen to Long before multiplying so the product cannot overflow Int
    // when the benchmark is run with a larger row count.
    val upperBound = numRows.toLong * percent / 100
    filterPushDownBenchmark(
      numRows,
      s"Select $percent% int rows (value < $upperBound)",
      s"value < $upperBound",
      selectExpr)
  }

  Seq("value IS NOT NULL", "value > -1", "value != -1").foreach { whereExpr =>
    filterPushDownBenchmark(
      numRows,
      s"Select all int rows ($whereExpr)",
      whereExpr,
      selectExpr)
  }
}
/**
 * Runs the string-`value` cases: predicates selecting zero rows, one row
 * (or one distinct value), and all rows.
 *
 * @param numRows     number of rows in `parquetTable`
 * @param width       number of payload columns (c1..c<width>) in `parquetTable`
 * @param searchValue probe value; it is quoted as a SQL string literal
 * @param colType     label used in the benchmark titles (e.g. "string")
 */
private def runStringBenchmark(
    numRows: Int, width: Int, searchValue: Int, colType: String): Unit = {
  // Display only: turns "a < value AND value < b" into "a < value < b" in titles.
  def shorten(title: String): String = title.replace("value AND value", "value")

  Seq("value IS NULL", s"'$searchValue' < value AND value < '$searchValue'")
    .foreach { whereExpr =>
      filterPushDownBenchmark(
        numRows, shorten(s"Select 0 $colType row ($whereExpr)"), whereExpr)
    }

  Seq(
    s"value = '$searchValue'",
    s"value <=> '$searchValue'",
    s"'$searchValue' <= value AND value <= '$searchValue'"
  ).foreach { whereExpr =>
    filterPushDownBenchmark(
      numRows, shorten(s"Select 1 $colType row ($whereExpr)"), whereExpr)
  }

  // Aggregating projection: "MAX(c1),...,MAX(cN), MAX(value)".
  val selectExpr = (1 to width).map(i => s"MAX(c$i)").mkString("", ",", ", MAX(value)")

  Seq("value IS NOT NULL").foreach { whereExpr =>
    filterPushDownBenchmark(
      numRows,
      s"Select all $colType rows ($whereExpr)",
      whereExpr,
      selectExpr)
  }
}
/**
 * Runs every pushdown suite in order. Each suite lives in its own private
 * helper below; this method only assigns the suite titles.
 *
 * @param mainArgs command-line arguments (not used by the suites)
 */
override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
  runBenchmark("Pushdown for many distinct value case") {
    benchmarkManyDistinctValues()
  }
  runBenchmark("Pushdown for few distinct value case (use dictionary encoding)") {
    benchmarkFewDistinctValues()
  }
  runBenchmark("Pushdown benchmark for StringStartsWith") {
    benchmarkStringStartsWith()
  }
  runBenchmark(s"Pushdown benchmark for ${DecimalType.simpleString}") {
    benchmarkDecimal()
  }
  runBenchmark("Pushdown benchmark for InSet -> InFilters") {
    benchmarkInSet()
  }
  runBenchmark(s"Pushdown benchmark for ${ByteType.simpleString}") {
    benchmarkByte()
  }
  runBenchmark("Pushdown benchmark for Timestamp") {
    benchmarkTimestamp()
  }
  runBenchmark("Pushdown benchmark with many filters") {
    benchmarkManyFilters()
  }
}

// Aggregating projection "MAX(c1),...,MAX(cN), MAX(value)" over `columnCount` payload columns.
private def maxOfAllColumns(columnCount: Int): String =
  (1 to columnCount).map(i => s"MAX(c$i)").mkString("", ",", ", MAX(value)")

// `percent` percent of `numRows`, multiplied in Long so the product cannot overflow Int.
private def percentOfRows(percent: Int): Long = numRows.toLong * percent / 100

// String and int `value` columns where every row holds a different value.
private def benchmarkManyDistinctValues(): Unit = {
  withTempPath { dir =>
    withTempTable("parquetTable") {
      Seq(true, false).foreach { useStringForValue =>
        prepareTable(dir, numRows, width, useStringForValue)
        if (useStringForValue) {
          runStringBenchmark(numRows, width, mid, "string")
        } else {
          runIntBenchmark(numRows, width, mid)
        }
      }
    }
  }
}

// String `value` column restricted to 200 distinct values.
private def benchmarkFewDistinctValues(): Unit = {
  val numDistinctValues = 200
  withTempPath { dir =>
    withTempTable("parquetTable") {
      prepareStringDictTable(dir, numRows, numDistinctValues, width)
      runStringBenchmark(numRows, width, numDistinctValues / 2, "distinct string")
    }
  }
}

// LIKE 'prefix%' predicates on a string `value` column.
private def benchmarkStringStartsWith(): Unit = {
  // `mid` without its last digit, e.g. 7864320 -> "786432".
  val midPrefix = mid.toString.dropRight(1)
  withTempPath { dir =>
    withTempTable("parquetTable") {
      prepareTable(dir, numRows, width, useStringForValue = true)
      Seq(
        "value like '10%'",
        "value like '1000%'",
        s"value like '$midPrefix%'"
      ).foreach { whereExpr =>
        val title = s"StringStartsWith filter: ($whereExpr)"
        filterPushDownBenchmark(numRows, title, whereExpr)
      }
    }
  }
}

// Point and range predicates on decimal `value` columns of three precisions.
private def benchmarkDecimal(): Unit = {
  val narrowestDecimal = s"decimal(${Decimal.MAX_INT_DIGITS}, 2)"
  val decimalTypes = Seq(
    narrowestDecimal,
    s"decimal(${Decimal.MAX_LONG_DIGITS}, 2)",
    s"decimal(${DecimalType.MAX_PRECISION}, 2)")
  val columns = (1 to width).map(i => s"CAST(id AS string) c$i")
  val selectExpr = maxOfAllColumns(width)

  withTempPath { dir =>
    decimalTypes.foreach { dt =>
      // For the narrowest type the ids are wrapped at 9999999 before the cast;
      // presumably so every value fits that type's precision -- TODO confirm.
      val valueCol = if (dt.equalsIgnoreCase(narrowestDecimal)) {
        monotonically_increasing_id() % 9999999
      } else {
        monotonically_increasing_id()
      }
      val df = spark.range(numRows)
        .selectExpr(columns: _*)
        .withColumn("value", valueCol.cast(dt))

      withTempTable("parquetTable") {
        saveAsTable(df, dir)

        Seq(s"value = $mid").foreach { whereExpr =>
          filterPushDownBenchmark(numRows, s"Select 1 $dt row ($whereExpr)", whereExpr)
        }

        Seq(10, 50, 90).foreach { percent =>
          val upperBound = percentOfRows(percent)
          filterPushDownBenchmark(
            numRows,
            s"Select $percent% $dt rows (value < $upperBound)",
            s"value < $upperBound",
            selectExpr)
        }
      }
    }
  }
}

// IN (...) predicates with 5..100 random values drawn from the lower 10/50/90 percent of ids.
private def benchmarkInSet(): Unit = {
  withTempPath { dir =>
    withTempTable("parquetTable") {
      prepareTable(dir, numRows, width, useStringForValue = false)
      Seq(5, 10, 50, 100).foreach { count =>
        Seq(10, 50, 90).foreach { distribution =>
          val upperBound = percentOfRows(distribution).toInt
          val filter = (0 until count).map(_ => Random.nextInt(upperBound))
          val whereExpr = s"value in(${filter.mkString(",")})"
          val title =
            s"InSet -> InFilters (values count: $count, distribution: $distribution)"
          filterPushDownBenchmark(numRows, title, whereExpr)
        }
      }
    }
  }
}

// Point and range predicates on a byte `value` column.
private def benchmarkByte(): Unit = {
  val byteType = ByteType.simpleString
  withTempPath { dir =>
    val columns = (1 to width).map(i => s"CAST(id AS string) c$i")
    val df = spark.range(numRows).selectExpr(columns: _*)
      .withColumn("value", (monotonically_increasing_id() % Byte.MaxValue).cast(ByteType))
      .orderBy("value")

    withTempTable("parquetTable") {
      saveAsTable(df, dir)

      Seq(s"value = CAST(${Byte.MaxValue / 2} AS $byteType)").foreach { whereExpr =>
        filterPushDownBenchmark(numRows, s"Select 1 $byteType row ($whereExpr)", whereExpr)
      }

      val selectExpr = maxOfAllColumns(width)
      Seq(10, 50, 90).foreach { percent =>
        val whereExpr = s"value < CAST(${Byte.MaxValue * percent / 100} AS $byteType)"
        filterPushDownBenchmark(
          numRows,
          s"Select $percent% $byteType rows ($whereExpr)",
          whereExpr,
          selectExpr)
      }
    }
  }
}

// Point and range predicates on a timestamp `value` column, once per Parquet output timestamp type.
private def benchmarkTimestamp(): Unit = {
  val columns = (1 to width).map(i => s"CAST(id AS string) c$i")
  val selectExpr = maxOfAllColumns(width)

  withTempPath { dir =>
    withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_TIMESTAMP_ENABLED.key -> true.toString) {
      ParquetOutputTimestampType.values.toSeq.map(_.toString).foreach { fileType =>
        withSQLConf(SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE.key -> fileType) {
          val df = spark.range(numRows).selectExpr(columns: _*)
            .withColumn("value", timestamp_seconds(monotonically_increasing_id()))

          withTempTable("parquetTable") {
            saveAsTable(df, dir)

            Seq(s"value = timestamp_seconds($mid)").foreach { whereExpr =>
              val title = s"Select 1 timestamp stored as $fileType row ($whereExpr)"
              filterPushDownBenchmark(numRows, title, whereExpr)
            }

            Seq(10, 50, 90).foreach { percent =>
              val whereExpr = s"value < timestamp_seconds(${percentOfRows(percent)})"
              filterPushDownBenchmark(
                numRows,
                s"Select $percent% timestamp stored as $fileType rows ($whereExpr)",
                whereExpr,
                selectExpr)
            }
          }
        }
      }
    }
  }
}

// A single-row, 500-column table queried with 1, 250 and 500 conjunctive filters.
private def benchmarkManyFilters(): Unit = {
  // This suite uses its own table shape instead of the object-level numRows/width.
  val singleRow = 1
  val columnCount = 500
  withTempPath { dir =>
    val columns = (1 to columnCount).map(i => s"id c$i")
    val df = spark.range(singleRow).selectExpr(columns: _*)

    withTempTable("parquetTable") {
      saveAsTable(df, dir)
      Seq(1, 250, 500).foreach { numFilter =>
        val whereExpr = (1 to numFilter).map(i => s"c$i = 0").mkString(" and ")
        // Note: InferFiltersFromConstraints will add more filters to this given filters
        filterPushDownBenchmark(singleRow, s"Select 1 row with $numFilter filters", whereExpr)
      }
    }
  }
}
}
{code}
{noformat}
================================================================================================
Pushdown for many distinct value case
================================================================================================
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 0 string row (value IS NULL): Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 99 111
10 158.9 6.3 1.0X
Parquet Vectorized (columnindex) 78 86
9 201.6 5.0 1.3X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 0 string row ('7864320' < value < '7864320'): Best Time(ms) Avg
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
-----------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 835
848 17 18.8 53.1 1.0X
Parquet Vectorized (columnindex) 91
96 4 173.5 5.8 9.2X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 1 string row (value = '7864320'): Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 839 846
6 18.7 53.3 1.0X
Parquet Vectorized (columnindex) 85 93
10 184.8 5.4 9.9X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 1 string row (value <=> '7864320'): Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 838 852
16 18.8 53.3 1.0X
Parquet Vectorized (columnindex) 79 85
4 197.9 5.1 10.5X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 1 string row ('7864320' <= value <= '7864320'): Best Time(ms) Avg
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 872
907 45 18.0 55.4 1.0X
Parquet Vectorized (columnindex) 83
89 5 188.9 5.3 10.5X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select all string rows (value IS NOT NULL): Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
--------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 16487 16727
253 1.0 1048.2 1.0X
Parquet Vectorized (columnindex) 16355 16426
62 1.0 1039.8 1.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 0 int row (value IS NULL): Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 64 68
5 246.7 4.1 1.0X
Parquet Vectorized (columnindex) 61 66
4 258.0 3.9 1.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 0 int row (7864320 < value < 7864320): Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
----------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 770 795
34 20.4 48.9 1.0X
Parquet Vectorized (columnindex) 78 84
5 201.1 5.0 9.8X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 1 int row (value = 7864320): Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 774 795
21 20.3 49.2 1.0X
Parquet Vectorized (columnindex) 77 82
6 205.3 4.9 10.1X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 1 int row (value <=> 7864320): Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 771 777
7 20.4 49.0 1.0X
Parquet Vectorized (columnindex) 69 76
5 226.8 4.4 11.1X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 1 int row (7864320 <= value <= 7864320): Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 769 794
29 20.4 48.9 1.0X
Parquet Vectorized (columnindex) 74 82
6 213.3 4.7 10.4X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 1 int row (7864319 < value < 7864321): Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
----------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 775 825
43 20.3 49.3 1.0X
Parquet Vectorized (columnindex) 76 81
5 206.3 4.8 10.2X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 10% int rows (value < 1572864): Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 2326 2395
51 6.8 147.9 1.0X
Parquet Vectorized (columnindex) 1655 1669
13 9.5 105.2 1.4X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 50% int rows (value < 7864320): Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 7898 8175
157 2.0 502.1 1.0X
Parquet Vectorized (columnindex) 7658 7731
73 2.1 486.9 1.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 90% int rows (value < 14155776): Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 13778 13858
78 1.1 876.0 1.0X
Parquet Vectorized (columnindex) 13771 13885
105 1.1 875.5 1.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select all int rows (value IS NOT NULL): Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 15072 15281
163 1.0 958.3 1.0X
Parquet Vectorized (columnindex) 15119 15344
194 1.0 961.3 1.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select all int rows (value > -1): Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 15257 15378
198 1.0 970.0 1.0X
Parquet Vectorized (columnindex) 15296 15519
232 1.0 972.5 1.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select all int rows (value != -1): Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 15262 15325
92 1.0 970.4 1.0X
Parquet Vectorized (columnindex) 15173 15255
84 1.0 964.7 1.0X
================================================================================================
Pushdown for few distinct value case (use dictionary encoding)
================================================================================================
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 0 distinct string row (value IS NULL): Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
----------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 53 59
5 298.8 3.3 1.0X
Parquet Vectorized (columnindex) 52 57
6 300.2 3.3 1.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 0 distinct string row ('100' < value < '100'): Best Time(ms) Avg
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 890
902 7 17.7 56.6 1.0X
Parquet Vectorized (columnindex) 59
62 4 266.2 3.8 15.1X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 1 distinct string row (value = '100'): Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
----------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 894 905
10 17.6 56.9 1.0X
Parquet Vectorized (columnindex) 125 130
6 126.2 7.9 7.2X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 1 distinct string row (value <=> '100'): Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 901 920
26 17.5 57.3 1.0X
Parquet Vectorized (columnindex) 119 127
4 132.1 7.6 7.6X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 1 distinct string row ('100' <= value <= '100'): Best Time(ms) Avg
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
--------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 902
914 10 17.4 57.3 1.0X
Parquet Vectorized (columnindex) 126
132 7 124.8 8.0 7.2X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select all distinct string rows (value IS NOT NULL): Best Time(ms) Avg
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
-----------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 16872
16947 97 0.9 1072.7 1.0X
Parquet Vectorized (columnindex) 16861
16970 80 0.9 1072.0 1.0X
================================================================================================
Pushdown benchmark for StringStartsWith
================================================================================================
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
StringStartsWith filter: (value like '10%'): Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
---------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 1025 1038
17 15.4 65.1 1.0X
Parquet Vectorized (columnindex) 852 868
16 18.5 54.2 1.2X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
StringStartsWith filter: (value like '1000%'): Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
-----------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 816 838
25 19.3 51.9 1.0X
Parquet Vectorized (columnindex) 74 79
5 213.7 4.7 11.1X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
StringStartsWith filter: (value like '786432%'): Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 817 836
11 19.2 52.0 1.0X
Parquet Vectorized (columnindex) 76 82
4 207.6 4.8 10.8X
================================================================================================
Pushdown benchmark for decimal
================================================================================================
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 1 decimal(9, 2) row (value = 7864320): Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
----------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 1125 1138
13 14.0 71.6 1.0X
Parquet Vectorized (columnindex) 50 55
5 313.1 3.2 22.4X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 10% decimal(9, 2) rows (value < 1572864): Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 4800 4930
131 3.3 305.2 1.0X
Parquet Vectorized (columnindex) 2227 2274
40 7.1 141.6 2.2X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 50% decimal(9, 2) rows (value < 7864320): Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 10016 10204
202 1.6 636.8 1.0X
Parquet Vectorized (columnindex) 9571 9677
63 1.6 608.5 1.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 90% decimal(9, 2) rows (value < 14155776): Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
--------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 11161 11403
187 1.4 709.6 1.0X
Parquet Vectorized (columnindex) 11103 11283
130 1.4 705.9 1.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 1 decimal(18, 2) row (value = 7864320): Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
-----------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 1150 1168
21 13.7 73.1 1.0X
Parquet Vectorized (columnindex) 45 48
5 350.0 2.9 25.6X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 10% decimal(18, 2) rows (value < 1572864): Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
--------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 1946 1978
34 8.1 123.7 1.0X
Parquet Vectorized (columnindex) 1155 1189
28 13.6 73.4 1.7X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 50% decimal(18, 2) rows (value < 7864320): Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
--------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 6206 6413
211 2.5 394.6 1.0X
Parquet Vectorized (columnindex) 5659 5786
96 2.8 359.8 1.1X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 90% decimal(18, 2) rows (value < 14155776): Best Time(ms) Avg
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
---------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 10375 10534
240 1.5 659.6 1.0X
Parquet Vectorized (columnindex) 10120 10334
221 1.6 643.4 1.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 1 decimal(38, 2) row (value = 7864320): Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
-----------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 1694 1748
49 9.3 107.7 1.0X
Parquet Vectorized (columnindex) 46 50
4 338.6 3.0 36.5X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 10% decimal(38, 2) rows (value < 1572864): Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
--------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 2712 2782
48 5.8 172.5 1.0X
Parquet Vectorized (columnindex) 1584 1611
32 9.9 100.7 1.7X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 50% decimal(38, 2) rows (value < 7864320): Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
--------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 8356 8499
82 1.9 531.3 1.0X
Parquet Vectorized (columnindex) 7781 7979
123 2.0 494.7 1.1X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 90% decimal(38, 2) rows (value < 14155776): Best Time(ms) Avg
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
---------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 14128 14304
182 1.1 898.2 1.0X
Parquet Vectorized (columnindex) 13940 14004
44 1.1 886.3 1.0X
================================================================================================
Pushdown benchmark for InSet -> InFilters
================================================================================================
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
InSet -> InFilters (values count: 5, distribution: 10): Best Time(ms) Avg
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
--------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 773
784 11 20.4 49.1 1.0X
Parquet Vectorized (columnindex) 113
118 5 139.4 7.2 6.8X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
InSet -> InFilters (values count: 5, distribution: 50): Best Time(ms) Avg
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
--------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 2910
2930 13 5.4 185.0 1.0X
Parquet Vectorized (columnindex) 116
120 4 136.1 7.3 25.2X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
InSet -> InFilters (values count: 5, distribution: 90): Best Time(ms) Avg
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
--------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 2207
2218 8 7.1 140.3 1.0X
Parquet Vectorized (columnindex) 117
123 8 134.3 7.4 18.8X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
InSet -> InFilters (values count: 10, distribution: 10): Best Time(ms) Avg
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
---------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 1493
1506 11 10.5 94.9 1.0X
Parquet Vectorized (columnindex) 159
164 6 99.2 10.1 9.4X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
InSet -> InFilters (values count: 10, distribution: 50): Best Time(ms) Avg
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
---------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 3591
3635 32 4.4 228.3 1.0X
Parquet Vectorized (columnindex) 170
175 5 92.6 10.8 21.1X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
InSet -> InFilters (values count: 10, distribution: 90): Best Time(ms) Avg
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
---------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 5079
5147 50 3.1 322.9 1.0X
Parquet Vectorized (columnindex) 172
180 5 91.2 11.0 29.5X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
InSet -> InFilters (values count: 50, distribution: 10): Best Time(ms) Avg
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
---------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 8280
8533 245 1.9 526.4 1.0X
Parquet Vectorized (columnindex) 8341
8423 93 1.9 530.3 1.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
InSet -> InFilters (values count: 50, distribution: 50): Best Time(ms) Avg
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
---------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 8248
8347 72 1.9 524.4 1.0X
Parquet Vectorized (columnindex) 8230
8303 66 1.9 523.2 1.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
InSet -> InFilters (values count: 50, distribution: 90): Best Time(ms) Avg
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
---------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 8219
8285 44 1.9 522.6 1.0X
Parquet Vectorized (columnindex) 8183
8381 184 1.9 520.3 1.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
InSet -> InFilters (values count: 100, distribution: 10): Best Time(ms) Avg
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
----------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 8354
8411 60 1.9 531.1 1.0X
Parquet Vectorized (columnindex) 8181
8256 60 1.9 520.1 1.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
InSet -> InFilters (values count: 100, distribution: 50): Best Time(ms) Avg
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
----------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 8151
8210 38 1.9 518.3 1.0X
Parquet Vectorized (columnindex) 8169
8210 37 1.9 519.4 1.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
InSet -> InFilters (values count: 100, distribution: 90): Best Time(ms) Avg
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
----------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 8131
8204 46 1.9 516.9 1.0X
Parquet Vectorized (columnindex) 8167
8231 65 1.9 519.3 1.0X
================================================================================================
Pushdown benchmark for tinyint
================================================================================================
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 1 tinyint row (value = CAST(63 AS tinyint)): Best Time(ms) Avg
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
----------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 1124
1198 68 14.0 71.4 1.0X
Parquet Vectorized (columnindex) 91
93 2 173.7 5.8 12.4X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 10% tinyint rows (value < CAST(12 AS tinyint)): Best Time(ms) Avg
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 1845
1892 37 8.5 117.3 1.0X
Parquet Vectorized (columnindex) 1113
1123 11 14.1 70.8 1.7X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 50% tinyint rows (value < CAST(63 AS tinyint)): Best Time(ms) Avg
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 5850
5911 73 2.7 371.9 1.0X
Parquet Vectorized (columnindex) 5450
5567 91 2.9 346.5 1.1X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 90% tinyint rows (value < CAST(114 AS tinyint)): Best Time(ms) Avg
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
--------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 10395
10458 62 1.5 660.9 1.0X
Parquet Vectorized (columnindex) 9928
10104 176 1.6 631.2 1.0X
================================================================================================
Pushdown benchmark for Timestamp
================================================================================================
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 1 timestamp stored as INT96 row (value = timestamp_seconds(7864320)):
Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
-----------------------------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized
3929 4094 107 4.0 249.8 1.0X
Parquet Vectorized (columnindex)
3991 4068 74 3.9 253.7 1.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 10% timestamp stored as INT96 rows (value < timestamp_seconds(1572864)):
Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
--------------------------------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized
4774 5003 163 3.3 303.5 1.0X
Parquet Vectorized (columnindex)
4769 4880 159 3.3 303.2 1.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 50% timestamp stored as INT96 rows (value < timestamp_seconds(7864320)):
Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
--------------------------------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized
7736 7884 142 2.0 491.9 1.0X
Parquet Vectorized (columnindex)
7587 7795 156 2.1 482.4 1.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 90% timestamp stored as INT96 rows (value <
timestamp_seconds(14155776)): Best Time(ms) Avg Time(ms) Stdev(ms)
Rate(M/s) Per Row(ns) Relative
---------------------------------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized
10722 10785 61 1.5 681.7 1.0X
Parquet Vectorized (columnindex)
10719 10775 55 1.5 681.5 1.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 1 timestamp stored as TIMESTAMP_MICROS row (value =
timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms)
Rate(M/s) Per Row(ns) Relative
----------------------------------------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized
1127 1172 47 14.0 71.6
1.0X
Parquet Vectorized (columnindex)
44 46 3 360.8 2.8
25.9X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 10% timestamp stored as TIMESTAMP_MICROS rows (value <
timestamp_seconds(1572864)): Best Time(ms) Avg Time(ms) Stdev(ms)
Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized
1861 1916 90 8.5 118.3
1.0X
Parquet Vectorized (columnindex)
1127 1160 22 14.0 71.7
1.7X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 50% timestamp stored as TIMESTAMP_MICROS rows (value <
timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms)
Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized
5809 5934 214 2.7 369.3
1.0X
Parquet Vectorized (columnindex)
5455 5523 93 2.9 346.8
1.1X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 90% timestamp stored as TIMESTAMP_MICROS rows (value <
timestamp_seconds(14155776)): Best Time(ms) Avg Time(ms) Stdev(ms)
Rate(M/s) Per Row(ns) Relative
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized
9777 10097 244 1.6 621.6
1.0X
Parquet Vectorized (columnindex)
9808 9849 44 1.6 623.6
1.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 1 timestamp stored as TIMESTAMP_MILLIS row (value =
timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms)
Rate(M/s) Per Row(ns) Relative
----------------------------------------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized
1125 1163 24 14.0 71.5
1.0X
Parquet Vectorized (columnindex)
43 47 5 369.3 2.7
26.4X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 10% timestamp stored as TIMESTAMP_MILLIS rows (value <
timestamp_seconds(1572864)): Best Time(ms) Avg Time(ms) Stdev(ms)
Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized
1905 1977 80 8.3 121.1
1.0X
Parquet Vectorized (columnindex)
1137 1186 40 13.8 72.3
1.7X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 50% timestamp stored as TIMESTAMP_MILLIS rows (value <
timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms)
Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized
6018 6266 230 2.6 382.6
1.0X
Parquet Vectorized (columnindex)
5631 5703 69 2.8 358.0
1.1X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 90% timestamp stored as TIMESTAMP_MILLIS rows (value <
timestamp_seconds(14155776)): Best Time(ms) Avg Time(ms) Stdev(ms)
Rate(M/s) Per Row(ns) Relative
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized
10132 10224 113 1.6 644.2
1.0X
Parquet Vectorized (columnindex)
9898 9992 69 1.6 629.3
1.0X
================================================================================================
Pushdown benchmark with many filters
================================================================================================
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 1 row with 1 filters: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 182 190
7 0.0 182396465.0 1.0X
Parquet Vectorized (columnindex) 187 192
5 0.0 187246572.0 1.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 1 row with 250 filters: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 2228 2257
19 0.0 2228318860.0 1.0X
Parquet Vectorized (columnindex) 2212 2244
24 0.0 2212486315.0 1.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 1 row with 500 filters: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 12391 12473
86 0.0 12391350628.0 1.0X
Parquet Vectorized (columnindex) 12438 12594
229 0.0 12438065459.0 1.0X
{noformat}
> Parquet support Column indexes
> ------------------------------
>
> Key: SPARK-26345
> URL: https://issues.apache.org/jira/browse/SPARK-26345
> Project: Spark
> Issue Type: Umbrella
> Components: SQL
> Affects Versions: 3.1.0
> Reporter: Yuming Wang
> Priority: Major
>
> Parquet 1.11.0 supports column indexing. Spark can support this feature for
> better read performance.
> More details:
> https://issues.apache.org/jira/browse/PARQUET-1201
--
This message was sent by Atlassian Jira
(v8.3.4#803005)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]