This is an automated email from the ASF dual-hosted git repository.
maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 98f0d9f32322 [SPARK-49605][SQL] Fix the prompt when `ascendingOrder`
is `DataTypeMismatch` in `SortArray`
98f0d9f32322 is described below
commit 98f0d9f32322074b01285f405c86df29997634a3
Author: panbingkun <[email protected]>
AuthorDate: Thu Sep 12 18:52:43 2024 +0200
[SPARK-49605][SQL] Fix the prompt when `ascendingOrder` is
`DataTypeMismatch` in `SortArray`
### What changes were proposed in this pull request?
This PR fixes the `prompt` (error message) produced when `ascendingOrder` causes
a `DataTypeMismatch` in `SortArray`.
### Why are the changes needed?
- Give an example with the following code:
```scala
val df = Seq((Array[Int](2, 1, 3), true), (Array.empty[Int],
false)).toDF("a", "b")
df.selectExpr("sort_array(a, b)").collect()
```
- Before:
```scala
scala> val df = Seq((Array[Int](2, 1, 3), true), (Array.empty[Int],
false)).toDF("a", "b")
val df: org.apache.spark.sql.DataFrame = [a: array<int>, b: boolean]
scala> df.selectExpr("sort_array(a, b)").collect()
org.apache.spark.sql.catalyst.ExtendedAnalysisException:
[DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE] Cannot resolve "sort_array(a, b)" due
to data type mismatch: The second parameter requires the "BOOLEAN" type,
however "b" has the type "BOOLEAN". SQLSTATE: 42K09; line 1 pos 0;
'Project [unresolvedalias(sort_array(a#7, b#8))]
+- Project [_1#2 AS a#7, _2#3 AS b#8]
+- LocalRelation [_1#2, _2#3]
at
org.apache.spark.sql.catalyst.analysis.package$AnalysisErrorAt.dataTypeMismatch(package.scala:73)
at
org.apache.spark.sql.catalyst.analysis.CheckAnalysis.$anonfun$checkAnalysis0$7(CheckAnalysis.scala:331)
at
org.apache.spark.sql.catalyst.analysis.CheckAnalysis.$anonfun$checkAnalysis0$7$adapted(CheckAnalysis.scala:313)
```
<img width="1394" alt="image"
src="https://github.com/user-attachments/assets/c0eea384-af29-42c1-9ee5-c65310de6070">
Obviously, this error message is `incorrect` and `confusing`. Through the
following code:
https://github.com/apache/spark/blob/8023504e69fdd037dea002e961b960fd9fa662ba/sql/api/src/main/scala/org/apache/spark/sql/functions.scala#L7176-L7195
we found that it actually requires `ascendingOrder` to be `foldable` and
the data type to be `BooleanType`.
- After:
```
scala> val df = Seq((Array[Int](2, 1, 3), true), (Array.empty[Int],
false)).toDF("a", "b")
val df: org.apache.spark.sql.DataFrame = [a: array<int>, b: boolean]
scala> df.selectExpr("sort_array(a, b)").collect()
org.apache.spark.sql.catalyst.ExtendedAnalysisException:
[DATATYPE_MISMATCH.NON_FOLDABLE_INPUT] Cannot resolve "sort_array(a, b)" due to
data type mismatch: the input `ascendingOrder` should be a foldable "BOOLEAN"
expression; however, got "b". SQLSTATE: 42K09; line 1 pos 0;
'Project [unresolvedalias(sort_array(a#7, b#8))]
+- Project [_1#2 AS a#7, _2#3 AS b#8]
+- LocalRelation [_1#2, _2#3]
at
org.apache.spark.sql.catalyst.analysis.package$AnalysisErrorAt.dataTypeMismatch(package.scala:73)
at
org.apache.spark.sql.catalyst.analysis.CheckAnalysis.$anonfun$checkAnalysis0$7(CheckAnalysis.scala:331)
at
org.apache.spark.sql.catalyst.analysis.CheckAnalysis.$anonfun$checkAnalysis0$7$adapted(CheckAnalysis.scala:313)
```
<img width="1396" alt="image"
src="https://github.com/user-attachments/assets/2c173aab-52b8-4794-8ef0-d14ae269aadc">
### Does this PR introduce _any_ user-facing change?
Yes. When the `ascendingOrder` argument of `SortArray` triggers a
`DataTypeMismatch`, the error message is now more `accurate`.
### How was this patch tested?
- Add new UT
- Pass GA.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #48082 from panbingkun/SPARK-49605.
Authored-by: panbingkun <[email protected]>
Signed-off-by: Max Gekk <[email protected]>
---
.../expressions/collectionOperations.scala | 32 +++++++++++++---------
.../sql-tests/analyzer-results/ansi/array.sql.out | 21 ++------------
.../sql-tests/analyzer-results/array.sql.out | 21 ++------------
.../resources/sql-tests/results/ansi/array.sql.out | 22 ++-------------
.../test/resources/sql-tests/results/array.sql.out | 22 ++-------------
.../apache/spark/sql/DataFrameFunctionsSuite.scala | 30 ++++++++++++++++++++
6 files changed, 57 insertions(+), 91 deletions(-)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
index 5d5aece35383..5cdd3c7eb62d 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
@@ -1058,20 +1058,26 @@ case class SortArray(base: Expression, ascendingOrder:
Expression)
override def checkInputDataTypes(): TypeCheckResult = base.dataType match {
case ArrayType(dt, _) if RowOrdering.isOrderable(dt) =>
- ascendingOrder match {
- case Literal(_: Boolean, BooleanType) =>
- TypeCheckResult.TypeCheckSuccess
- case _ =>
- DataTypeMismatch(
- errorSubClass = "UNEXPECTED_INPUT_TYPE",
- messageParameters = Map(
- "paramIndex" -> ordinalNumber(1),
- "requiredType" -> toSQLType(BooleanType),
- "inputSql" -> toSQLExpr(ascendingOrder),
- "inputType" -> toSQLType(ascendingOrder.dataType))
- )
+ if (!ascendingOrder.foldable) {
+ DataTypeMismatch(
+ errorSubClass = "NON_FOLDABLE_INPUT",
+ messageParameters = Map(
+ "inputName" -> toSQLId("ascendingOrder"),
+ "inputType" -> toSQLType(ascendingOrder.dataType),
+ "inputExpr" -> toSQLExpr(ascendingOrder)))
+ } else if (ascendingOrder.dataType != BooleanType) {
+ DataTypeMismatch(
+ errorSubClass = "UNEXPECTED_INPUT_TYPE",
+ messageParameters = Map(
+ "paramIndex" -> ordinalNumber(1),
+ "requiredType" -> toSQLType(BooleanType),
+ "inputSql" -> toSQLExpr(ascendingOrder),
+ "inputType" -> toSQLType(ascendingOrder.dataType))
+ )
+ } else {
+ TypeCheckResult.TypeCheckSuccess
}
- case ArrayType(dt, _) =>
+ case ArrayType(_, _) =>
DataTypeMismatch(
errorSubClass = "INVALID_ORDERING_TYPE",
messageParameters = Map(
diff --git
a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/array.sql.out
b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/array.sql.out
index 57108c4582f4..53595d1b8a3e 100644
--- a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/array.sql.out
+++ b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/array.sql.out
@@ -194,25 +194,8 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
-- !query
select sort_array(array('b', 'd'), cast(NULL as boolean))
-- !query analysis
-org.apache.spark.sql.catalyst.ExtendedAnalysisException
-{
- "errorClass" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE",
- "sqlState" : "42K09",
- "messageParameters" : {
- "inputSql" : "\"CAST(NULL AS BOOLEAN)\"",
- "inputType" : "\"BOOLEAN\"",
- "paramIndex" : "second",
- "requiredType" : "\"BOOLEAN\"",
- "sqlExpr" : "\"sort_array(array(b, d), CAST(NULL AS BOOLEAN))\""
- },
- "queryContext" : [ {
- "objectType" : "",
- "objectName" : "",
- "startIndex" : 8,
- "stopIndex" : 57,
- "fragment" : "sort_array(array('b', 'd'), cast(NULL as boolean))"
- } ]
-}
+Project [sort_array(array(b, d), cast(null as boolean)) AS sort_array(array(b,
d), CAST(NULL AS BOOLEAN))#x]
++- OneRowRelation
-- !query
diff --git
a/sql/core/src/test/resources/sql-tests/analyzer-results/array.sql.out
b/sql/core/src/test/resources/sql-tests/analyzer-results/array.sql.out
index fb331089d754..4db56d6c7056 100644
--- a/sql/core/src/test/resources/sql-tests/analyzer-results/array.sql.out
+++ b/sql/core/src/test/resources/sql-tests/analyzer-results/array.sql.out
@@ -194,25 +194,8 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
-- !query
select sort_array(array('b', 'd'), cast(NULL as boolean))
-- !query analysis
-org.apache.spark.sql.catalyst.ExtendedAnalysisException
-{
- "errorClass" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE",
- "sqlState" : "42K09",
- "messageParameters" : {
- "inputSql" : "\"CAST(NULL AS BOOLEAN)\"",
- "inputType" : "\"BOOLEAN\"",
- "paramIndex" : "second",
- "requiredType" : "\"BOOLEAN\"",
- "sqlExpr" : "\"sort_array(array(b, d), CAST(NULL AS BOOLEAN))\""
- },
- "queryContext" : [ {
- "objectType" : "",
- "objectName" : "",
- "startIndex" : 8,
- "stopIndex" : 57,
- "fragment" : "sort_array(array('b', 'd'), cast(NULL as boolean))"
- } ]
-}
+Project [sort_array(array(b, d), cast(null as boolean)) AS sort_array(array(b,
d), CAST(NULL AS BOOLEAN))#x]
++- OneRowRelation
-- !query
diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/array.sql.out
b/sql/core/src/test/resources/sql-tests/results/ansi/array.sql.out
index d17d87900fc7..7394e428091c 100644
--- a/sql/core/src/test/resources/sql-tests/results/ansi/array.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/ansi/array.sql.out
@@ -151,27 +151,9 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
-- !query
select sort_array(array('b', 'd'), cast(NULL as boolean))
-- !query schema
-struct<>
+struct<sort_array(array(b, d), CAST(NULL AS BOOLEAN)):array<string>>
-- !query output
-org.apache.spark.sql.catalyst.ExtendedAnalysisException
-{
- "errorClass" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE",
- "sqlState" : "42K09",
- "messageParameters" : {
- "inputSql" : "\"CAST(NULL AS BOOLEAN)\"",
- "inputType" : "\"BOOLEAN\"",
- "paramIndex" : "second",
- "requiredType" : "\"BOOLEAN\"",
- "sqlExpr" : "\"sort_array(array(b, d), CAST(NULL AS BOOLEAN))\""
- },
- "queryContext" : [ {
- "objectType" : "",
- "objectName" : "",
- "startIndex" : 8,
- "stopIndex" : 57,
- "fragment" : "sort_array(array('b', 'd'), cast(NULL as boolean))"
- } ]
-}
+NULL
-- !query
diff --git a/sql/core/src/test/resources/sql-tests/results/array.sql.out
b/sql/core/src/test/resources/sql-tests/results/array.sql.out
index 92da0a490ff8..c1330c620acf 100644
--- a/sql/core/src/test/resources/sql-tests/results/array.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/array.sql.out
@@ -151,27 +151,9 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
-- !query
select sort_array(array('b', 'd'), cast(NULL as boolean))
-- !query schema
-struct<>
+struct<sort_array(array(b, d), CAST(NULL AS BOOLEAN)):array<string>>
-- !query output
-org.apache.spark.sql.catalyst.ExtendedAnalysisException
-{
- "errorClass" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE",
- "sqlState" : "42K09",
- "messageParameters" : {
- "inputSql" : "\"CAST(NULL AS BOOLEAN)\"",
- "inputType" : "\"BOOLEAN\"",
- "paramIndex" : "second",
- "requiredType" : "\"BOOLEAN\"",
- "sqlExpr" : "\"sort_array(array(b, d), CAST(NULL AS BOOLEAN))\""
- },
- "queryContext" : [ {
- "objectType" : "",
- "objectName" : "",
- "startIndex" : 8,
- "stopIndex" : 57,
- "fragment" : "sort_array(array('b', 'd'), cast(NULL as boolean))"
- } ]
-}
+NULL
-- !query
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
index d488adc5ac3d..f16171940df2 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
@@ -964,6 +964,36 @@ class DataFrameFunctionsSuite extends QueryTest with
SharedSparkSession {
queryContext = Array(ExpectedContext("", "", 0, 12, "sort_array(a)"))
)
+ val df4 = Seq((Array[Int](2, 1, 3), true), (Array.empty[Int],
false)).toDF("a", "b")
+ checkError(
+ exception = intercept[AnalysisException] {
+ df4.selectExpr("sort_array(a, b)").collect()
+ },
+ condition = "DATATYPE_MISMATCH.NON_FOLDABLE_INPUT",
+ sqlState = "42K09",
+ parameters = Map(
+ "inputName" -> "`ascendingOrder`",
+ "inputType" -> "\"BOOLEAN\"",
+ "inputExpr" -> "\"b\"",
+ "sqlExpr" -> "\"sort_array(a, b)\""),
+ context = ExpectedContext(fragment = "sort_array(a, b)", start = 0, stop
= 15)
+ )
+
+ checkError(
+ exception = intercept[AnalysisException] {
+ df4.selectExpr("sort_array(a, 'A')").collect()
+ },
+ condition = "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE",
+ sqlState = "42K09",
+ parameters = Map(
+ "sqlExpr" -> "\"sort_array(a, A)\"",
+ "paramIndex" -> "second",
+ "inputSql" -> "\"A\"",
+ "inputType" -> "\"STRING\"",
+ "requiredType" -> "\"BOOLEAN\""),
+ context = ExpectedContext(fragment = "sort_array(a, 'A')", start = 0,
stop = 17)
+ )
+
checkAnswer(
df.select(array_sort($"a"), array_sort($"b")),
Seq(
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]