Github user cloud-fan commented on a diff in the pull request:
https://github.com/apache/spark/pull/21623#discussion_r199080962
--- Diff:
sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala
---
@@ -140,6 +142,32 @@ class ParquetFilterSuite extends QueryTest with
ParquetTest with SharedSQLContex
checkBinaryFilterPredicate(predicate, filterClass,
Seq(Row(expected)))(df)
}
+ // This function tests filters that go exactly through the `canDrop` and
`inverseCanDrop` code paths.
+ private def testStringStartsWith(dataFrame: DataFrame, filter: String):
Unit = {
+ withTempPath { dir =>
+ val path = dir.getCanonicalPath
+ dataFrame.write.option("parquet.block.size", 512).parquet(path)
+ Seq(true, false).foreach { pushDown =>
+ withSQLConf(
+ SQLConf.PARQUET_FILTER_PUSHDOWN_STRING_STARTSWITH_ENABLED.key ->
pushDown.toString) {
+ val accu = new NumRowGroupsAcc
+ sparkContext.register(accu)
+
+ val df = spark.read.parquet(path).filter(filter)
+ df.foreachPartition((it: Iterator[Row]) => it.foreach(v =>
accu.add(0)))
+ df.collect
--- End diff --
what does this `collect` do? `foreachPartition` is already an action
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]