This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 4e754f778fdc [SPARK-47822][SQL] Prohibit Hash Expressions from hashing the Variant Data Type
4e754f778fdc is described below

commit 4e754f778fdc9628bc8af671553f2d85ce8ac32d
Author: Harsh Motwani <harsh.motw...@databricks.com>
AuthorDate: Wed Apr 17 15:57:17 2024 +0800

    [SPARK-47822][SQL] Prohibit Hash Expressions from hashing the Variant Data Type
    
    ### What changes were proposed in this pull request?
    
    This PR prohibits hash expressions from hashing VariantType elements: HashExpression.checkInputDataTypes now rejects any input whose data type contains a Variant.
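
    The check is recursive, so a Variant nested inside an array or struct is rejected as well. A self-contained sketch of the predicate (it mirrors the private hasVariantType helper in the diff below, but inlines the recursion instead of calling the Spark-internal DataType.existsRecursively, so it runs standalone):

    ```scala
    import org.apache.spark.sql.types._

    // Sketch of the check: true if the data type contains VariantType
    // anywhere in its structure (the diff's helper delegates this
    // recursion to DataType.existsRecursively).
    def hasVariantType(dt: DataType): Boolean = dt match {
      case _: VariantType => true
      case ArrayType(elementType, _) => hasVariantType(elementType)
      case MapType(keyType, valueType, _) =>
        hasVariantType(keyType) || hasVariantType(valueType)
      case StructType(fields) => fields.exists(f => hasVariantType(f.dataType))
      case _ => false
    }

    hasVariantType(VariantType)                                    // true
    hasVariantType(ArrayType(VariantType))                         // true (nested)
    hasVariantType(StructType(Seq(StructField("v", VariantType)))) // true (nested)
    hasVariantType(IntegerType)                                    // false
    ```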
    
    ### Why are the changes needed?
    
    Hashing has not been implemented for VariantType elements, so the current implementation crashes during execution.
    
    ### Does this PR introduce _any_ user-facing change?
    
    Previously, attempting to hash Variant data crashed Spark during execution. Now, the query is rejected at analysis time instead.
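
    A minimal sketch of the new user-facing behavior (hypothetical spark-shell session, assuming a build that includes this change); hash, xxhash64, etc. all share HashExpression.checkInputDataTypes, so they are rejected alike:

    ```scala
    import org.apache.spark.sql.AnalysisException

    // parse_json produces a VARIANT value; hashing it now fails during
    // analysis instead of crashing at execution time.
    try {
      spark.sql("""SELECT hash(parse_json('{"a": 1}'))""").collect()
    } catch {
      // Expected error class: DATATYPE_MISMATCH.HASH_VARIANT_TYPE
      case e: AnalysisException => println(e.getErrorClass)
    }
    ```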
    
    ### How was this patch tested?
    
    Added a unit test in ExpressionTypeCheckingSuite.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    No
    
    Closes #46017 from harshmotw-db/hash_variant.
    
    Authored-by: Harsh Motwani <harsh.motw...@databricks.com>
    Signed-off-by: Wenchen Fan <wenc...@databricks.com>
---
 common/utils/src/main/resources/error/error-conditions.json   |  5 +++++
 docs/sql-error-conditions-datatype-mismatch-error-class.md    |  4 ++++
 .../org/apache/spark/sql/catalyst/expressions/hash.scala      |  8 ++++++++
 .../sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala   | 11 +++++++++++
 4 files changed, 28 insertions(+)

diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json
index 9636ddbf73bc..54415f80fee0 100644
--- a/common/utils/src/main/resources/error/error-conditions.json
+++ b/common/utils/src/main/resources/error/error-conditions.json
@@ -736,6 +736,11 @@
           "Input to the function <functionName> cannot contain elements of the 
\"MAP\" type. In Spark, same maps may have different hashcode, thus hash 
expressions are prohibited on \"MAP\" elements. To restore previous behavior 
set \"spark.sql.legacy.allowHashOnMapType\" to \"true\"."
         ]
       },
+      "HASH_VARIANT_TYPE" : {
+        "message" : [
+          "Input to the function <functionName> cannot contain elements of the 
\"VARIANT\" type yet."
+        ]
+      },
       "INPUT_SIZE_NOT_ONE" : {
         "message" : [
           "Length of <exprName> should be 1."
diff --git a/docs/sql-error-conditions-datatype-mismatch-error-class.md b/docs/sql-error-conditions-datatype-mismatch-error-class.md
index 1d18836ac9e7..971319e3e0fe 100644
--- a/docs/sql-error-conditions-datatype-mismatch-error-class.md
+++ b/docs/sql-error-conditions-datatype-mismatch-error-class.md
@@ -100,6 +100,10 @@ Filter expression `<filter>` of type `<type>` is not a boolean.
 
 Input to the function `<functionName>` cannot contain elements of the "MAP" type. In Spark, same maps may have different hashcode, thus hash expressions are prohibited on "MAP" elements. To restore previous behavior set "spark.sql.legacy.allowHashOnMapType" to "true".
 
+## HASH_VARIANT_TYPE
+
+Input to the function `<functionName>` cannot contain elements of the "VARIANT" type yet.
+
 ## INPUT_SIZE_NOT_ONE
 
 Length of `<exprName>` should be 1.
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala
index 436efa892416..5089cea136a8 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala
@@ -271,6 +271,10 @@ abstract class HashExpression[E] extends Expression {
     dt.existsRecursively(_.isInstanceOf[MapType])
   }
 
+  private def hasVariantType(dt: DataType): Boolean = {
+    dt.existsRecursively(_.isInstanceOf[VariantType])
+  }
+
   override def checkInputDataTypes(): TypeCheckResult = {
     if (children.length < 1) {
       throw QueryCompilationErrors.wrongNumArgsError(
@@ -281,6 +285,10 @@ abstract class HashExpression[E] extends Expression {
       DataTypeMismatch(
         errorSubClass = "HASH_MAP_TYPE",
         messageParameters = Map("functionName" -> toSQLId(prettyName)))
+    } else if (children.exists(child => hasVariantType(child.dataType))) {
+      DataTypeMismatch(
+        errorSubClass = "HASH_VARIANT_TYPE",
+        messageParameters = Map("functionName" -> toSQLId(prettyName)))
     } else {
       TypeCheckResult.TypeCheckSuccess
     }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala
index 4b58755e13ef..ba9521f221fb 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala
@@ -747,6 +747,17 @@ class ExpressionTypeCheckingSuite extends SparkFunSuite with SQLHelper with Quer
     )
   }
 
+  test("hash expressions are prohibited on VariantType elements") {
+    val argument = Literal.create(null, VariantType)
+    val murmur3Hash = new Murmur3Hash(Seq(argument))
+    assert(murmur3Hash.checkInputDataTypes() ==
+      DataTypeMismatch(
+        errorSubClass = "HASH_VARIANT_TYPE",
+        messageParameters = Map("functionName" -> toSQLId(murmur3Hash.prettyName))
+      )
+    )
+  }
+
   test("check types for Lag") {
     val lag = Lag(Literal(1), NonFoldableLiteral(10), Literal(null), true)
     assert(lag.checkInputDataTypes() ==

