This is an automated email from the ASF dual-hosted git repository.
maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new ee24847ad10 [SPARK-39427][SQL] Disable ANSI intervals in the
percentile functions
ee24847ad10 is described below
commit ee24847ad100139628a9bffe45f711bdebaa0170
Author: Max Gekk <[email protected]>
AuthorDate: Thu Jun 9 15:24:11 2022 +0300
[SPARK-39427][SQL] Disable ANSI intervals in the percentile functions
### What changes were proposed in this pull request?
In this PR, I propose to stop supporting ANSI intervals in the percentile
functions by removing the YearMonthIntervalType and DayTimeIntervalType types
from the list of input types. I propose to properly support ANSI intervals and
re-enable them after that.
### Why are the changes needed?
To avoid confusing users with the results of the percentile functions when
inputs are ANSI intervals. At the moment, the functions return the DOUBLE (or
ARRAY OF DOUBLE) type regardless of the input type. In the case of ANSI
intervals, the functions should return ANSI intervals too.
### Does this PR introduce _any_ user-facing change?
No, since the functions haven't been released yet.
### How was this patch tested?
By running affected test suites:
```
$ build/sbt "sql/test:testOnly
org.apache.spark.sql.expressions.ExpressionInfoSuite"
$ build/sbt "sql/testOnly *ExpressionsSchemaSuite"
$ build/sbt "sql/testOnly org.apache.spark.sql.SQLQueryTestSuite"
$ build/sbt "test:testOnly *PercentileSuite"
$ build/sbt "test:testOnly *PercentileQuerySuite"
```
and checked manually that ANSI intervals are not supported as input types:
```sql
spark-sql> SELECT percentile(col, 0.5) FROM VALUES (INTERVAL '0' MONTH),
(INTERVAL '10' MONTH) AS tab(col);
Error in query: cannot resolve 'percentile(tab.col, CAST(0.5BD AS DOUBLE),
1L)' due to data type mismatch: argument 1 requires numeric type, however,
'tab.col' is of interval month type.; line 1 pos 7;
```
Closes #36817 from MaxGekk/percentile-disable-ansi-interval.
Authored-by: Max Gekk <[email protected]>
Signed-off-by: Max Gekk <[email protected]>
---
.../sql/catalyst/expressions/aggregate/percentiles.scala | 15 ++++-----------
.../catalyst/expressions/aggregate/PercentileSuite.scala | 8 ++++----
.../org/apache/spark/sql/PercentileQuerySuite.scala | 16 ++++++++--------
3 files changed, 16 insertions(+), 23 deletions(-)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/percentiles.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/percentiles.scala
index 3acb2f2cc97..0efe09bca2e 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/percentiles.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/percentiles.scala
@@ -67,8 +67,7 @@ abstract class PercentileBase extends
TypedImperativeAggregate[OpenHashMap[AnyRe
case _: ArrayType => ArrayType(DoubleType, false)
case _ => DoubleType
}
- Seq(TypeCollection(NumericType, YearMonthIntervalType,
DayTimeIntervalType),
- percentageExpType, IntegralType)
+ Seq(NumericType, percentageExpType, IntegralType)
}
// Check the inputTypes are valid, and the percentageExpression satisfies:
@@ -288,7 +287,7 @@ abstract class PercentileBase extends
TypedImperativeAggregate[OpenHashMap[AnyRe
usage =
"""
_FUNC_(col, percentage [, frequency]) - Returns the exact percentile
value of numeric
- or ansi interval column `col` at the given percentage. The value of
percentage must be
+ column `col` at the given percentage. The value of percentage must be
between 0.0 and 1.0. The value of frequency should be positive integral
_FUNC_(col, array(percentage1 [, percentage2]...) [, frequency]) -
Returns the exact
@@ -303,10 +302,6 @@ abstract class PercentileBase extends
TypedImperativeAggregate[OpenHashMap[AnyRe
3.0
> SELECT _FUNC_(col, array(0.25, 0.75)) FROM VALUES (0), (10) AS
tab(col);
[2.5,7.5]
- > SELECT _FUNC_(col, 0.5) FROM VALUES (INTERVAL '0' MONTH), (INTERVAL
'10' MONTH) AS tab(col);
- 5.0
- > SELECT _FUNC_(col, array(0.2, 0.5)) FROM VALUES (INTERVAL '0' SECOND),
(INTERVAL '10' SECOND) AS tab(col);
- [2000000.0,5000000.0]
""",
group = "agg_funcs",
since = "2.1.0")
@@ -360,13 +355,11 @@ case class Percentile(
}
@ExpressionDescription(
- usage = "_FUNC_(col) - Returns the median of numeric or ansi interval column
`col`.",
+ usage = "_FUNC_(col) - Returns the median of numeric column `col`.",
examples = """
Examples:
> SELECT _FUNC_(col) FROM VALUES (0), (10) AS tab(col);
5.0
- > SELECT _FUNC_(col) FROM VALUES (INTERVAL '0' MONTH), (INTERVAL '10'
MONTH) AS tab(col);
- 5.0
""",
group = "agg_funcs",
since = "3.4.0")
@@ -385,7 +378,7 @@ case class Median(child: Expression)
/**
* Return a percentile value based on a continuous distribution of
- * numeric or ansi interval column at the given percentage (specified in ORDER
BY clause).
+ * numeric column at the given percentage (specified in ORDER BY clause).
* The value of percentage must be between 0.0 and 1.0.
*/
case class PercentileCont(left: Expression, right: Expression, reverse:
Boolean = false)
diff --git
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/PercentileSuite.scala
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/PercentileSuite.scala
index e08c6279fd6..a3ae7a44a30 100644
---
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/PercentileSuite.scala
+++
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/PercentileSuite.scala
@@ -170,8 +170,8 @@ class PercentileSuite extends SparkFunSuite {
val child = AttributeReference("a", dataType)()
val percentile = new Percentile(child, percentage)
assertEqual(percentile.checkInputDataTypes(),
- TypeCheckFailure(s"argument 1 requires (numeric or interval year to
month or " +
- s"interval day to second) type, however, 'a' is of
${dataType.simpleString} type."))
+ TypeCheckFailure(s"argument 1 requires numeric type," +
+ s" however, 'a' is of ${dataType.simpleString} type."))
}
val invalidFrequencyDataTypes = Seq(FloatType, DoubleType, BooleanType,
@@ -184,8 +184,8 @@ class PercentileSuite extends SparkFunSuite {
val frq = AttributeReference("frq", frequencyType)()
val percentile = new Percentile(child, percentage, frq)
assertEqual(percentile.checkInputDataTypes(),
- TypeCheckFailure(s"argument 1 requires (numeric or interval year to
month or " +
- s"interval day to second) type, however, 'a' is of
${dataType.simpleString} type."))
+ TypeCheckFailure(s"argument 1 requires numeric type," +
+ s" however, 'a' is of ${dataType.simpleString} type."))
}
for(dataType <- validDataTypes;
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/PercentileQuerySuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/PercentileQuerySuite.scala
index f39f0c18024..823c1375de0 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/PercentileQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/PercentileQuerySuite.scala
@@ -29,21 +29,21 @@ class PercentileQuerySuite extends QueryTest with
SharedSparkSession {
private val table = "percentile_test"
- test("SPARK-37138: Support Ansi Interval type in Percentile") {
+ test("SPARK-37138, SPARK-39427: Disable Ansi Interval type in Percentile") {
withTempView(table) {
Seq((Period.ofMonths(100), Duration.ofSeconds(100L)),
(Period.ofMonths(200), Duration.ofSeconds(200L)),
(Period.ofMonths(300), Duration.ofSeconds(300L)))
.toDF("col1", "col2").createOrReplaceTempView(table)
- checkAnswer(
+ val e = intercept[AnalysisException] {
spark.sql(
s"""SELECT
- | CAST(percentile(col1, 0.5) AS STRING),
- | SUM(null),
- | CAST(percentile(col2, 0.5) AS STRING)
- |FROM $table
- """.stripMargin),
- Row("200.0", null, "2.0E8"))
+ | CAST(percentile(col1, 0.5) AS STRING),
+ | SUM(null),
+ | CAST(percentile(col2, 0.5) AS STRING)
+ |FROM $table""".stripMargin).collect()
+ }
+ assert(e.getMessage.contains("data type mismatch"))
}
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]