This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch branch-3.5
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.5 by this push:
new 695be328b4c [SPARK-45106][SQL] `PercentileCont` should check user supplied input
695be328b4c is described below
commit 695be328b4cfb4210ea8331fa7aedaa6a874fe65
Author: Bruce Robbins <[email protected]>
AuthorDate: Fri Sep 8 12:38:35 2023 -0700
[SPARK-45106][SQL] `PercentileCont` should check user supplied input
### What changes were proposed in this pull request?
Change `PercentileCont` to explicitly check user-supplied input by calling
`checkInputDataTypes` on the replacement.
### Why are the changes needed?
`PercentileCont` does not currently check the user's input. If the runtime
replacement (an instance of `Percentile`) rejects the user's input, the runtime
replacement ends up unresolved.
For example, this query throws an internal error rather than producing a
useful error message:
```
select percentile_cont(b) WITHIN GROUP (ORDER BY a DESC) as x
from (values (12, 0.25), (13, 0.25), (22, 0.25)) as (a, b);
[INTERNAL_ERROR] Cannot resolve the runtime replaceable expression "percentile_cont(a, b)". The replacement is unresolved: "percentile(a, b, 1)".
org.apache.spark.SparkException: [INTERNAL_ERROR] Cannot resolve the runtime replaceable expression "percentile_cont(a, b)". The replacement is unresolved: "percentile(a, b, 1)".
at org.apache.spark.SparkException$.internalError(SparkException.scala:92)
at org.apache.spark.SparkException$.internalError(SparkException.scala:96)
at org.apache.spark.sql.catalyst.analysis.CheckAnalysis.$anonfun$checkAnalysis0$6(CheckAnalysis.scala:313)
at org.apache.spark.sql.catalyst.analysis.CheckAnalysis.$anonfun$checkAnalysis0$6$adapted(CheckAnalysis.scala:277)
...
```
With this PR, the above query will produce the following error message:
```
[DATATYPE_MISMATCH.NON_FOLDABLE_INPUT] Cannot resolve "percentile_cont(a, b)" due to data type mismatch: the input percentage should be a foldable "DOUBLE" expression; however, got "b".; line 1 pos 7;
```
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
New tests.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #42857 from bersprockets/pc_checkinputtype_issue.
Authored-by: Bruce Robbins <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
(cherry picked from commit 2b4387fe9498a7ba403f96f288e6c5b6660dba6e)
Signed-off-by: Dongjoon Hyun <[email protected]>
---
.../expressions/aggregate/percentiles.scala | 5 ++++
.../sql-tests/analyzer-results/percentiles.sql.out | 25 ++++++++++++++++++++
.../resources/sql-tests/inputs/percentiles.sql | 4 ++++
.../sql-tests/results/percentiles.sql.out | 27 ++++++++++++++++++++++
4 files changed, 61 insertions(+)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/percentiles.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/percentiles.scala
index 74d84829869..73e44fe91ba 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/percentiles.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/percentiles.scala
@@ -368,6 +368,11 @@ case class PercentileCont(left: Expression, right: Expression, reverse: Boolean
val direction = if (reverse) " DESC" else ""
s"$prettyName($distinct${right.sql}) WITHIN GROUP (ORDER BY ${left.sql}$direction)"
}
+
+ override def checkInputDataTypes(): TypeCheckResult = {
+ percentile.checkInputDataTypes()
+ }
+
override protected def withNewChildrenInternal(
newLeft: Expression, newRight: Expression): PercentileCont =
this.copy(left = newLeft, right = newRight)
diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/percentiles.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/percentiles.sql.out
index 36845ce6346..16fb510d5d4 100644
--- a/sql/core/src/test/resources/sql-tests/analyzer-results/percentiles.sql.out
+++ b/sql/core/src/test/resources/sql-tests/analyzer-results/percentiles.sql.out
@@ -868,6 +868,31 @@ Aggregate [percentile_disc(a#x, cast(0.0 as double), false, 0, 0, true) AS p0#x,
+- LocalRelation [a#x]
+-- !query
+SELECT
+ percentile_cont(b) WITHIN GROUP (ORDER BY a DESC) as p0
+FROM values (12, 0.25), (13, 0.25), (22, 0.25) as v(a, b)
+-- !query analysis
+org.apache.spark.sql.catalyst.ExtendedAnalysisException
+{
+ "errorClass" : "DATATYPE_MISMATCH.NON_FOLDABLE_INPUT",
+ "sqlState" : "42K09",
+ "messageParameters" : {
+ "inputExpr" : "\"b\"",
+ "inputName" : "percentage",
+ "inputType" : "\"DOUBLE\"",
+ "sqlExpr" : "\"percentile_cont(a, b)\""
+ },
+ "queryContext" : [ {
+ "objectType" : "",
+ "objectName" : "",
+ "startIndex" : 10,
+ "stopIndex" : 58,
+ "fragment" : "percentile_cont(b) WITHIN GROUP (ORDER BY a DESC)"
+ } ]
+}
+
+
-- !query
SET spark.sql.legacy.percentileDiscCalculation = false
-- !query analysis
diff --git a/sql/core/src/test/resources/sql-tests/inputs/percentiles.sql b/sql/core/src/test/resources/sql-tests/inputs/percentiles.sql
index 87c5d4be90c..eae8a71be7e 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/percentiles.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/percentiles.sql
@@ -374,4 +374,8 @@ SELECT
percentile_disc(1.0) WITHIN GROUP (ORDER BY a) as p10
FROM VALUES (0), (1), (2), (3), (4) AS v(a);
+SELECT
+ percentile_cont(b) WITHIN GROUP (ORDER BY a DESC) as p0
+FROM values (12, 0.25), (13, 0.25), (22, 0.25) as v(a, b);
+
SET spark.sql.legacy.percentileDiscCalculation = false;
diff --git a/sql/core/src/test/resources/sql-tests/results/percentiles.sql.out b/sql/core/src/test/resources/sql-tests/results/percentiles.sql.out
index 0d6ab542861..a0a4dc35f3f 100644
--- a/sql/core/src/test/resources/sql-tests/results/percentiles.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/percentiles.sql.out
@@ -840,6 +840,33 @@ struct<p0:double,p1:double,p2:double,p3:double,p4:double,p5:double,p6:double,p7:
0.0 0.0 0.0 1.0 1.0 2.0 2.0 2.0 3.0 3.0
4.0
+-- !query
+SELECT
+ percentile_cont(b) WITHIN GROUP (ORDER BY a DESC) as p0
+FROM values (12, 0.25), (13, 0.25), (22, 0.25) as v(a, b)
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.catalyst.ExtendedAnalysisException
+{
+ "errorClass" : "DATATYPE_MISMATCH.NON_FOLDABLE_INPUT",
+ "sqlState" : "42K09",
+ "messageParameters" : {
+ "inputExpr" : "\"b\"",
+ "inputName" : "percentage",
+ "inputType" : "\"DOUBLE\"",
+ "sqlExpr" : "\"percentile_cont(a, b)\""
+ },
+ "queryContext" : [ {
+ "objectType" : "",
+ "objectName" : "",
+ "startIndex" : 10,
+ "stopIndex" : 58,
+ "fragment" : "percentile_cont(b) WITHIN GROUP (ORDER BY a DESC)"
+ } ]
+}
+
+
-- !query
SET spark.sql.legacy.percentileDiscCalculation = false
-- !query schema
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]