Github user gatorsmile commented on a diff in the pull request:
https://github.com/apache/spark/pull/20201#discussion_r162732939
--- Diff:
sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala
---
@@ -95,6 +96,34 @@ class DataSourceV2Suite extends QueryTest with SharedSQLContext {
}
}
+ test("partitioning reporting") {
+ import org.apache.spark.sql.functions.{count, sum}
+ Seq(classOf[PartitionAwareDataSource],
classOf[JavaPartitionAwareDataSource]).foreach { cls =>
+ withClue(cls.getName) {
+ val df = spark.read.format(cls.getName).load()
+ checkAnswer(df, Seq(Row(1, 4), Row(1, 4), Row(3, 6), Row(2, 6),
Row(4, 2), Row(4, 2)))
+
+ val groupByColA = df.groupBy('a).agg(sum('b))
+ checkAnswer(groupByColA, Seq(Row(1, 8), Row(2, 6), Row(3, 6),
Row(4, 4)))
+ assert(groupByColA.queryExecution.executedPlan.collectFirst {
+ case e: ShuffleExchangeExec => e
+ }.isEmpty)
+
+ val groupByColAB = df.groupBy('a, 'b).agg(count("*"))
--- End diff ---
Try `df.groupBy('a + 'b).agg(count("*")).show()`.
At the very least, it should not fail, even if we do not support complex
`ClusteredDistribution` expressions.
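
For example, such an extra assertion could look roughly like the sketch below. It assumes the `df`, `checkAnswer`, `Row`, and implicit Symbol-to-Column conversions already in scope in this test; the expected rows follow from the sample data above (a + b of (1,4), (1,4), (3,6), (2,6), (4,2), (4,2) is 5, 5, 9, 8, 6, 6), and the presence of a shuffle for this grouping expression is an assumption, not something the current diff verifies.

```scala
// Sketch of an additional check for grouping by a non-trivial expression.
import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec
import org.apache.spark.sql.functions.count

val groupByExpr = df.groupBy('a + 'b).agg(count("*"))
// Expected counts per a + b value, derived from the sample rows above.
checkAnswer(groupByExpr, Seq(Row(5, 2), Row(9, 1), Row(8, 1), Row(6, 2)))
// Assumption: the source only reports clustering on plain column references,
// so grouping by the expression a + b still requires an exchange.
assert(groupByExpr.queryExecution.executedPlan.collectFirst {
  case e: ShuffleExchangeExec => e
}.isDefined)
```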
---