Github user gatorsmile commented on a diff in the pull request:
https://github.com/apache/spark/pull/21415#discussion_r190661257
--- Diff: sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVBenchmarks.scala ---
@@ -74,7 +74,49 @@ object CSVBenchmarks {
}
}
+  def multiColumnsBenchmark(rowsNum: Int): Unit = {
+    val colsNum = 1000
+    val benchmark = new Benchmark(s"Wide rows with $colsNum columns", rowsNum)
+
+    withTempPath { path =>
+      val fields = Seq.tabulate(colsNum)(i => StructField(s"col$i", IntegerType))
+      val schema = StructType(fields)
+      val values = (0 until colsNum).map(i => i.toString).mkString(",")
+      val columnNames = schema.fieldNames
+
+      spark.range(rowsNum)
+        .select(Seq.tabulate(colsNum)(i => lit(i).as(s"col$i")): _*)
+        .write.option("header", true)
+        .csv(path.getAbsolutePath)
+
+      val ds = spark.read.schema(schema).csv(path.getAbsolutePath)
+
+      benchmark.addCase(s"Select $colsNum columns", 3) { _ =>
+        ds.select("*").filter((row: Row) => true).count()
+      }
+      val cols100 = columnNames.take(100).map(Column(_))
+      benchmark.addCase(s"Select 100 columns", 3) { _ =>
+        ds.select(cols100: _*).filter((row: Row) => true).count()
+      }
+      benchmark.addCase(s"Select one column", 3) { _ =>
+        ds.select($"col1").filter((row: Row) => true).count()
+      }
+
+      /*
+      Intel(R) Core(TM) i7-7920HQ CPU @ 3.10GHz
+
+      Wide rows with 1000 columns:    Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
+      ---------------------------------------------------------------------------------------
+      Select 1000 columns                 76910 / 78065          0.0       76909.8       1.0X
+      Select 100 columns                  28625 / 32884          0.0       28625.1       2.7X
+      Select one column                   22498 / 22669          0.0       22497.8       3.4X
--- End diff ---
count(1) too?
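
A minimal sketch of the extra case this suggests, assuming the `ds` and
`benchmark` values from the diff above are in scope (the case name is
illustrative):

    benchmark.addCase("count(1)", 3) { _ =>
      // count(1) needs no column values, so this case isolates the scan and
      // row-counting cost from per-field conversion, in contrast to the
      // select-then-filter cases above.
      ds.selectExpr("count(1)").collect()
    }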