This is an automated email from the ASF dual-hosted git repository.
maxgekk pushed a commit to branch branch-3.4
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.4 by this push:
new f0b421553bc [SPARK-45079][SQL] Fix an internal error from
`percentile_approx()` on `NULL` accuracy
f0b421553bc is described below
commit f0b421553bc1850cc3e8ed5d564da8f6425cd244
Author: Max Gekk <[email protected]>
AuthorDate: Wed Sep 6 10:32:37 2023 +0300
[SPARK-45079][SQL] Fix an internal error from `percentile_approx()` on
`NULL` accuracy
### What changes were proposed in this pull request?
In this PR, I propose to check that the `accuracy` argument of
`ApproximatePercentile` is not NULL and, if it is, to throw an
`AnalysisException` with the new error class `DATATYPE_MISMATCH.UNEXPECTED_NULL`.
### Why are the changes needed?
To fix the issue demonstrated by the example:
```sql
$ spark-sql (default)> SELECT percentile_approx(col, array(0.5, 0.4, 0.1),
NULL) FROM VALUES (0), (1), (2), (10) AS tab(col);
[INTERNAL_ERROR] The Spark SQL phase analysis failed with an internal
error. You hit a bug in Spark or the Spark plugins you use. Please, report this
bug to the corresponding communities or vendors, and provide the full stack
trace.
```
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
By running the new test:
```
$ build/sbt "test:testOnly *.ApproximatePercentileQuerySuite"
```
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #42817 from MaxGekk/fix-internal-error-in-percentile_approx.
Authored-by: Max Gekk <[email protected]>
Signed-off-by: Max Gekk <[email protected]>
(cherry picked from commit 24b29adcf53616067a9fa2ca201e3f4d2f54436b)
Signed-off-by: Max Gekk <[email protected]>
---
.../aggregate/ApproximatePercentile.scala | 7 ++++-
.../sql/ApproximatePercentileQuerySuite.scala | 31 ++++++++++++++++++++++
2 files changed, 37 insertions(+), 1 deletion(-)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala
index 1499f358ac4..ebf1085c0c1 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala
@@ -96,7 +96,8 @@ case class ApproximatePercentile(
}
// Mark as lazy so that accuracyExpression is not evaluated during tree
transformation.
- private lazy val accuracy: Long =
accuracyExpression.eval().asInstanceOf[Number].longValue
+ private lazy val accuracyNum = accuracyExpression.eval().asInstanceOf[Number]
+ private lazy val accuracy: Long = accuracyNum.longValue
override def inputTypes: Seq[AbstractDataType] = {
// Support NumericType, DateType, TimestampType and TimestampNTZType since
their internal types
@@ -137,6 +138,10 @@ case class ApproximatePercentile(
"inputExpr" -> toSQLExpr(accuracyExpression)
)
)
+ } else if (accuracyNum == null) {
+ DataTypeMismatch(
+ errorSubClass = "UNEXPECTED_NULL",
+ messageParameters = Map("exprName" -> "accuracy"))
} else if (accuracy <= 0 || accuracy > Int.MaxValue) {
DataTypeMismatch(
errorSubClass = "VALUE_OUT_OF_RANGE",
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala
index 9237c9e9486..8598e92f029 100644
---
a/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala
+++
b/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala
@@ -337,4 +337,35 @@ class ApproximatePercentileQuerySuite extends QueryTest
with SharedSparkSession
Row(Period.ofMonths(200).normalized(), null,
Duration.ofSeconds(200L)))
}
}
+
+ test("SPARK-45079: NULL arguments of percentile_approx") {
+ checkError(
+ exception = intercept[AnalysisException] {
+ sql(
+ """
+ |SELECT percentile_approx(col, array(0.5, 0.4, 0.1), NULL)
+ |FROM VALUES (0), (1), (2), (10) AS tab(col);
+ |""".stripMargin).collect()
+ },
+ errorClass = "DATATYPE_MISMATCH.UNEXPECTED_NULL",
+ parameters = Map(
+ "exprName" -> "accuracy",
+ "sqlExpr" -> "\"percentile_approx(col, array(0.5, 0.4, 0.1), NULL)\""),
+ context = ExpectedContext(
+ "", "", 8, 57, "percentile_approx(col, array(0.5, 0.4, 0.1), NULL)"))
+ checkError(
+ exception = intercept[AnalysisException] {
+ sql(
+ """
+ |SELECT percentile_approx(col, NULL, 100)
+ |FROM VALUES (0), (1), (2), (10) AS tab(col);
+ |""".stripMargin).collect()
+ },
+ errorClass = "DATATYPE_MISMATCH.UNEXPECTED_NULL",
+ parameters = Map(
+ "exprName" -> "percentage",
+ "sqlExpr" -> "\"percentile_approx(col, NULL, 100)\""),
+ context = ExpectedContext(
+ "", "", 8, 40, "percentile_approx(col, NULL, 100)"))
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]