This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new cf02b1a60d9d [SPARK-47569][SQL] Disallow comparing variant cf02b1a60d9d is described below commit cf02b1a60d9d605e5429fc18a7b7823a4d02a3dd Author: Chenhao Li <chenhao...@databricks.com> AuthorDate: Mon Apr 1 15:42:02 2024 +0800 [SPARK-47569][SQL] Disallow comparing variant ### What changes were proposed in this pull request? It adds type-checking rules to disallow comparing variant values (including grouping by a variant column). We may support comparing variant values in the future, but since we don't have a proper comparison implementation at this point, they should be disallowed on the user surface. ### How was this patch tested? Unit tests. Closes #45726 from chenhao-db/SPARK-47569. Authored-by: Chenhao Li <chenhao...@databricks.com> Signed-off-by: Wenchen Fan <wenc...@databricks.com> --- .../src/main/resources/error/error-classes.json | 6 +++++ docs/sql-error-conditions.md | 6 +++++ .../sql/catalyst/expressions/OrderUtils.scala | 3 ++- .../spark/sql/catalyst/expressions/ExprUtils.scala | 11 +++++++- .../scala/org/apache/spark/sql/VariantSuite.scala | 29 ++++++++++++++++++++++ 5 files changed, 53 insertions(+), 2 deletions(-) diff --git a/common/utils/src/main/resources/error/error-classes.json b/common/utils/src/main/resources/error/error-classes.json index 11c8204d2c93..821aa2615ee2 100644 --- a/common/utils/src/main/resources/error/error-classes.json +++ b/common/utils/src/main/resources/error/error-classes.json @@ -1390,6 +1390,12 @@ ], "sqlState" : "42805" }, + "GROUP_EXPRESSION_TYPE_IS_NOT_ORDERABLE" : { + "message" : [ + "The expression <sqlExpr> cannot be used as a grouping expression because its data type <dataType> is not an orderable data type." + ], + "sqlState" : "42822" + }, "HLL_INVALID_INPUT_SKETCH_BUFFER" : { "message" : [ "Invalid call to <function>; only valid HLL sketch buffers are supported as inputs (such as those produced by the `hll_sketch_agg` function)." 
diff --git a/docs/sql-error-conditions.md b/docs/sql-error-conditions.md index 85b9e85ac420..e9c17bf4f93b 100644 --- a/docs/sql-error-conditions.md +++ b/docs/sql-error-conditions.md @@ -846,6 +846,12 @@ GROUP BY `<index>` refers to an expression `<aggExpr>` that contains an aggregat GROUP BY position `<index>` is not in select list (valid range is [1, `<size>`]). +### GROUP_EXPRESSION_TYPE_IS_NOT_ORDERABLE + +[SQLSTATE: 42822](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) + +The expression `<sqlExpr>` cannot be used as a grouping expression because its data type `<dataType>` is not an orderable data type. + ### HLL_INVALID_INPUT_SKETCH_BUFFER [SQLSTATE: 22546](sql-error-conditions-sqlstates.html#class-22-data-exception) diff --git a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/expressions/OrderUtils.scala b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/expressions/OrderUtils.scala index 9319b104024a..385e0f00695a 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/expressions/OrderUtils.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/expressions/OrderUtils.scala @@ -16,7 +16,7 @@ */ package org.apache.spark.sql.catalyst.expressions -import org.apache.spark.sql.types.{ArrayType, AtomicType, DataType, NullType, StructType, UserDefinedType} +import org.apache.spark.sql.types.{ArrayType, AtomicType, DataType, NullType, StructType, UserDefinedType, VariantType} object OrderUtils { /** @@ -24,6 +24,7 @@ object OrderUtils { */ def isOrderable(dataType: DataType): Boolean = dataType match { case NullType => true + case VariantType => false case dt: AtomicType => true case struct: StructType => struct.fields.forall(f => isOrderable(f.dataType)) case array: ArrayType => isOrderable(array.elementType) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExprUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExprUtils.scala index 
eaf10973e71d..258bc0ed8fe7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExprUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExprUtils.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression import org.apache.spark.sql.catalyst.plans.logical.Aggregate import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, CharVarcharUtils} import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryErrorsBase, QueryExecutionErrors} -import org.apache.spark.sql.types.{DataType, MapType, StringType, StructType} +import org.apache.spark.sql.types.{DataType, MapType, StringType, StructType, VariantType} import org.apache.spark.unsafe.types.UTF8String object ExprUtils extends QueryErrorsBase { @@ -193,6 +193,15 @@ object ExprUtils extends QueryErrorsBase { messageParameters = Map("sqlExpr" -> expr.sql)) } + // Check if the data type of expr is orderable. + if (expr.dataType.existsRecursively(_.isInstanceOf[VariantType])) { + expr.failAnalysis( + errorClass = "GROUP_EXPRESSION_TYPE_IS_NOT_ORDERABLE", + messageParameters = Map( + "sqlExpr" -> toSQLExpr(expr), + "dataType" -> toSQLType(expr.dataType))) + } + if (!expr.deterministic) { // This is just a sanity check, our analysis rule PullOutNondeterministic should // already pull out those nondeterministic expressions and evaluate them in diff --git a/sql/core/src/test/scala/org/apache/spark/sql/VariantSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/VariantSuite.scala index b9926625b1de..4f82dbc90dc5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/VariantSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/VariantSuite.scala @@ -269,4 +269,33 @@ class VariantSuite extends QueryTest with SharedSparkSession { } } } + + test("group/order/join variant are disabled") { + var ex = intercept[AnalysisException] { + spark.sql("select parse_json('') group by 1") + } + 
assert(ex.getErrorClass == "GROUP_EXPRESSION_TYPE_IS_NOT_ORDERABLE") + + ex = intercept[AnalysisException] { + spark.sql("select parse_json('') order by 1") + } + assert(ex.getErrorClass == "DATATYPE_MISMATCH.INVALID_ORDERING_TYPE") + + ex = intercept[AnalysisException] { + spark.sql("select parse_json('') sort by 1") + } + assert(ex.getErrorClass == "DATATYPE_MISMATCH.INVALID_ORDERING_TYPE") + + ex = intercept[AnalysisException] { + spark.sql("with t as (select 1 as a, parse_json('') as v) " + + "select rank() over (partition by a order by v) from t") + } + assert(ex.getErrorClass == "DATATYPE_MISMATCH.INVALID_ORDERING_TYPE") + + ex = intercept[AnalysisException] { + spark.sql("with t as (select parse_json('') as v) " + + "select t1.v from t as t1 join t as t2 on t1.v = t2.v") + } + assert(ex.getErrorClass == "DATATYPE_MISMATCH.INVALID_ORDERING_TYPE") + } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org