Github user kiszk commented on a diff in the pull request:
https://github.com/apache/spark/pull/21931#discussion_r208561981
--- Diff:
sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/AggregateBenchmark.scala
---
@@ -366,6 +366,43 @@ class AggregateBenchmark extends BenchmarkBase {
*/
}
+ ignore("capacity for fast hash aggregate") {
+ val N = 20 << 20
+ val M = 1 << 19
+
+ val benchmark = new Benchmark("Aggregate w multiple keys", N)
+ sparkSession.range(N)
+ .selectExpr(
+ "id",
+ s"(id % $M) as k1",
+ s"cast(id % $M as int) as k2",
+ s"cast(id % $M as double) as k3",
+ s"cast(id % $M as float) as k4") .createOrReplaceTempView("test")
+
+ def f(): Unit = sparkSession.sql("select k1, k2, k3, k4, sum(k1),
sum(k2), sum(k3), " +
+ "sum(k4) from test group by k1, k2, k3, k4").collect()
+
+ benchmark.addCase(s"fasthash = default") { iter =>
+
sparkSession.conf.set("spark.sql.codegen.aggregate.map.row.capacitybit", "16")
+ f()
+ }
+
+ benchmark.addCase(s"fasthash = config") { iter =>
+
sparkSession.conf.set("spark.sql.codegen.aggregate.map.row.capacitybit", "20")
+ f()
+ }
+
+ benchmark.run()
+
+ /*
+ Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Windows 7 6.1
+ Intel64 Family 6 Model 94 Stepping 3, GenuineIntel
+ Aggregate w multiple keys: Best/Avg Time(ms)
Rate(M/s) Per Row(ns) Relative
--- End diff --
nit: we need to keep each line at 100 characters or fewer. IIUC, this line
exceeds 100 characters.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]