Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/20265#discussion_r161421828 --- Diff: sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala --- @@ -483,6 +484,64 @@ object OrcReadBenchmark { } } + def filterPushDownBenchmark(values: Int, width: Int): Unit = { + val benchmark = new Benchmark(s"Filter Pushdown", values) + + withTempPath { dir => + withTempTable("t1", "nativeOrcTable", "hiveOrcTable") { + import spark.implicits._ + val selectExpr = (1 to width).map(i => s"CAST(value AS STRING) c$i") + val whereExpr = (1 to width).map(i => s"NOT c$i LIKE '%not%exist%'").mkString(" AND ") + val df = spark.range(values).map(_ => Random.nextLong).selectExpr(selectExpr: _*) + .withColumn("uniqueID", monotonically_increasing_id()) + + df.createOrReplaceTempView("t1") + prepareTable(dir, spark.sql("SELECT * FROM t1")) + + Seq(false, true).foreach { value => + benchmark.addCase(s"Native ORC MR (Pushdown=$value)") { _ => + withSQLConf( + SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false", + SQLConf.ORC_FILTER_PUSHDOWN_ENABLED.key -> s"$value") { + spark.sql( + s""" + |SELECT c1 + |FROM nativeOrcTable + |WHERE uniqueID = 0 AND $whereExpr + """.stripMargin).collect() + } + } + } + + Seq(false, true).foreach { value => + benchmark.addCase(s"Native ORC Vectorized (Pushdown=$value)") { _ => + withSQLConf(SQLConf.ORC_FILTER_PUSHDOWN_ENABLED.key -> s"$value") { + spark.sql( + s""" + |SELECT c1 + |FROM nativeOrcTable + |WHERE uniqueID = 0 AND $whereExpr + """.stripMargin).collect() + } + } + } + + /* + Java HotSpot(TM) 64-Bit Server VM 1.8.0_152-b16 on Mac OS X 10.13.2 + Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz + + Filter Pushdown: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Native ORC MR (Pushdown=false) 16169 / 16193 0.3 3084.0 1.0X --- End diff -- let's focus on PPD for this benchmark and not disable 
the vectorized reader, e.g.:
```
col LIKE '%not%exist%'
col LIKE '%not%exist%' (Pushdown)
col = 3
col = 3 (Pushdown)
...
```
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org