This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 76785cd [SPARK-27581][SQL] DataFrame countDistinct("*") shouldn't fail with AnalysisException
76785cd is described below
commit 76785cd6f0d26825e9a79f831239633e953cef74
Author: Liang-Chi Hsieh <[email protected]>
AuthorDate: Mon Apr 29 21:17:32 2019 +0800
[SPARK-27581][SQL] DataFrame countDistinct("*") shouldn't fail with AnalysisException
## What changes were proposed in this pull request?
Currently `countDistinct("*")` doesn't work; an `AnalysisException` is thrown:
```scala
val df = sql("select id % 100 from range(100000)")
df.select(countDistinct("*")).first()
org.apache.spark.sql.AnalysisException: Invalid usage of '*' in expression 'count';
```
Users need to work around this with `expr`:
```scala
df.select(expr("count(distinct(*))")).first()
```
This limits some API usage like `df.select(count("*"), countDistinct("*"))`.
This PR takes the simplest fix: it lets the analyzer expand the star and resolve the `count` function.
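With the fix, star expansion happens during analysis, so both aggregates can be combined directly. A minimal sketch of the expected behavior, assuming a `spark` session as in the shell (the row values follow from the unit test added below):
```scala
import org.apache.spark.sql.functions.{count, countDistinct}

val df = spark.sql("select id % 100 from range(100000)")

// The analyzer now expands "*" before resolving the count aggregate,
// so countDistinct("*") no longer throws AnalysisException.
df.select(countDistinct("*")).first()              // Row(100)
df.select(count("*"), countDistinct("*")).first()  // Row(100000, 100)
```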
## How was this patch tested?
Added a unit test.
Closes #24482 from viirya/SPARK-27581.
Authored-by: Liang-Chi Hsieh <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
---
sql/core/src/main/scala/org/apache/spark/sql/functions.scala | 7 ++++---
.../scala/org/apache/spark/sql/DataFrameAggregateSuite.scala | 10 ++++++++++
2 files changed, 14 insertions(+), 3 deletions(-)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index c1997b6..f92bf79 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -358,9 +358,10 @@ object functions {
* @since 1.3.0
*/
@scala.annotation.varargs
-  def countDistinct(expr: Column, exprs: Column*): Column = {
-    withAggregateFunction(Count.apply((expr +: exprs).map(_.expr)), isDistinct = true)
-  }
+  def countDistinct(expr: Column, exprs: Column*): Column =
+    // For usage like countDistinct("*"), we should let analyzer expand star and
+    // resolve function.
+    Column(UnresolvedFunction("count", (expr +: exprs).map(_.expr), isDistinct = true))
/**
* Aggregate function: returns the number of distinct items in a group.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
index 73259a0..97aaa1b 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
@@ -772,4 +772,14 @@ class DataFrameAggregateSuite extends QueryTest with SharedSQLContext {
      Row(Seq(0.0f, 0.0f), Row(0.0d, Double.NaN), Seq(Row(0.0d, Double.NaN)), 2)
)
}
+
+ test("SPARK-27581: DataFrame countDistinct(\"*\") shouldn't fail with
AnalysisException") {
+ val df = sql("select id % 100 from range(100000)")
+ val distinctCount1 = df.select(expr("count(distinct(*))"))
+ val distinctCount2 = df.select(countDistinct("*"))
+ checkAnswer(distinctCount1, distinctCount2)
+
+ val countAndDistinct = df.select(count("*"), countDistinct("*"))
+ checkAnswer(countAndDistinct, Row(100000, 100))
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]