(spark) branch master updated: [SPARK-47569][SQL] Disallow comparing variant

wenchen Mon, 01 Apr 2024 00:42:19 -0700

This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new cf02b1a60d9d [SPARK-47569][SQL] Disallow comparing variant
cf02b1a60d9d is described below

commit cf02b1a60d9d605e5429fc18a7b7823a4d02a3dd
Author: Chenhao Li <chenhao...@databricks.com>
AuthorDate: Mon Apr 1 15:42:02 2024 +0800

    [SPARK-47569][SQL] Disallow comparing variant
    
    ### What changes were proposed in this pull request?
    
    This PR adds type-checking rules that disallow comparing variant values
    (including grouping by a variant column). We may support comparing
    variant values in the future, but since there is no proper comparison
    implementation at this point, such comparisons should be disallowed on
    the user surface.
    
    ### How was this patch tested?
    
    Unit tests.
    
    Closes #45726 from chenhao-db/SPARK-47569.
    
    Authored-by: Chenhao Li <chenhao...@databricks.com>
    Signed-off-by: Wenchen Fan <wenc...@databricks.com>
---
 .../src/main/resources/error/error-classes.json    |  6 +++++
 docs/sql-error-conditions.md                       |  6 +++++
 .../sql/catalyst/expressions/OrderUtils.scala      |  3 ++-
 .../spark/sql/catalyst/expressions/ExprUtils.scala | 11 +++++++-
 .../scala/org/apache/spark/sql/VariantSuite.scala  | 29 ++++++++++++++++++++++
 5 files changed, 53 insertions(+), 2 deletions(-)

diff --git a/common/utils/src/main/resources/error/error-classes.json 
b/common/utils/src/main/resources/error/error-classes.json
index 11c8204d2c93..821aa2615ee2 100644
--- a/common/utils/src/main/resources/error/error-classes.json
+++ b/common/utils/src/main/resources/error/error-classes.json
@@ -1390,6 +1390,12 @@
     ],
     "sqlState" : "42805"
   },
+  "GROUP_EXPRESSION_TYPE_IS_NOT_ORDERABLE" : {
+    "message" : [
+      "The expression <sqlExpr> cannot be used as a grouping expression 
because its data type <dataType> is not an orderable data type."
+    ],
+    "sqlState" : "42822"
+  },
   "HLL_INVALID_INPUT_SKETCH_BUFFER" : {
     "message" : [
       "Invalid call to <function>; only valid HLL sketch buffers are supported 
as inputs (such as those produced by the `hll_sketch_agg` function)."
diff --git a/docs/sql-error-conditions.md b/docs/sql-error-conditions.md
index 85b9e85ac420..e9c17bf4f93b 100644
--- a/docs/sql-error-conditions.md
+++ b/docs/sql-error-conditions.md
@@ -846,6 +846,12 @@ GROUP BY `<index>` refers to an expression `<aggExpr>` 
that contains an aggregat
 
 GROUP BY position `<index>` is not in select list (valid range is [1, 
`<size>`]).
 
+### GROUP_EXPRESSION_TYPE_IS_NOT_ORDERABLE
+
+[SQLSTATE: 
42822](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation)
+
+The expression `<sqlExpr>` cannot be used as a grouping expression because its 
data type `<dataType>` is not an orderable data type.
+
 ### HLL_INVALID_INPUT_SKETCH_BUFFER
 
 [SQLSTATE: 22546](sql-error-conditions-sqlstates.html#class-22-data-exception)
diff --git 
a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/expressions/OrderUtils.scala
 
b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/expressions/OrderUtils.scala
index 9319b104024a..385e0f00695a 100644
--- 
a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/expressions/OrderUtils.scala
+++ 
b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/expressions/OrderUtils.scala
@@ -16,7 +16,7 @@
  */
 package org.apache.spark.sql.catalyst.expressions
 
-import org.apache.spark.sql.types.{ArrayType, AtomicType, DataType, NullType, 
StructType, UserDefinedType}
+import org.apache.spark.sql.types.{ArrayType, AtomicType, DataType, NullType, 
StructType, UserDefinedType, VariantType}
 
 object OrderUtils {
   /**
@@ -24,6 +24,7 @@ object OrderUtils {
    */
   def isOrderable(dataType: DataType): Boolean = dataType match {
     case NullType => true
+    case VariantType => false
     case dt: AtomicType => true
     case struct: StructType => struct.fields.forall(f => 
isOrderable(f.dataType))
     case array: ArrayType => isOrderable(array.elementType)
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExprUtils.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExprUtils.scala
index eaf10973e71d..258bc0ed8fe7 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExprUtils.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExprUtils.scala
@@ -28,7 +28,7 @@ import 
org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression
 import org.apache.spark.sql.catalyst.plans.logical.Aggregate
 import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, CharVarcharUtils}
 import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryErrorsBase, 
QueryExecutionErrors}
-import org.apache.spark.sql.types.{DataType, MapType, StringType, StructType}
+import org.apache.spark.sql.types.{DataType, MapType, StringType, StructType, 
VariantType}
 import org.apache.spark.unsafe.types.UTF8String
 
 object ExprUtils extends QueryErrorsBase {
@@ -193,6 +193,15 @@ object ExprUtils extends QueryErrorsBase {
           messageParameters = Map("sqlExpr" -> expr.sql))
       }
 
+      // Check if the data type of expr is orderable.
+      if (expr.dataType.existsRecursively(_.isInstanceOf[VariantType])) {
+        expr.failAnalysis(
+          errorClass = "GROUP_EXPRESSION_TYPE_IS_NOT_ORDERABLE",
+          messageParameters = Map(
+            "sqlExpr" -> toSQLExpr(expr),
+            "dataType" -> toSQLType(expr.dataType)))
+      }
+
       if (!expr.deterministic) {
         // This is just a sanity check, our analysis rule 
PullOutNondeterministic should
         // already pull out those nondeterministic expressions and evaluate 
them in
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/VariantSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/VariantSuite.scala
index b9926625b1de..4f82dbc90dc5 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/VariantSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/VariantSuite.scala
@@ -269,4 +269,33 @@ class VariantSuite extends QueryTest with 
SharedSparkSession {
       }
     }
   }
+
+  test("group/order/join variant are disabled") {
+    var ex = intercept[AnalysisException] {
+      spark.sql("select parse_json('') group by 1")
+    }
+    assert(ex.getErrorClass == "GROUP_EXPRESSION_TYPE_IS_NOT_ORDERABLE")
+
+    ex = intercept[AnalysisException] {
+      spark.sql("select parse_json('') order by 1")
+    }
+    assert(ex.getErrorClass == "DATATYPE_MISMATCH.INVALID_ORDERING_TYPE")
+
+    ex = intercept[AnalysisException] {
+      spark.sql("select parse_json('') sort by 1")
+    }
+    assert(ex.getErrorClass == "DATATYPE_MISMATCH.INVALID_ORDERING_TYPE")
+
+    ex = intercept[AnalysisException] {
+      spark.sql("with t as (select 1 as a, parse_json('') as v) " +
+        "select rank() over (partition by a order by v) from t")
+    }
+    assert(ex.getErrorClass == "DATATYPE_MISMATCH.INVALID_ORDERING_TYPE")
+
+    ex = intercept[AnalysisException] {
+      spark.sql("with t as (select parse_json('') as v) " +
+        "select t1.v from t as t1 join t as t2 on t1.v = t2.v")
+    }
+    assert(ex.getErrorClass == "DATATYPE_MISMATCH.INVALID_ORDERING_TYPE")
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

(spark) branch master updated: [SPARK-47569][SQL] Disallow comparing variant

Reply via email to