(spark) branch master updated: [SPARK-48271][SQL] Turn match error in RowEncoder into UNSUPPORTED_DATA_TYPE_FOR_ENCODER

yao Wed, 15 May 2024 01:46:33 -0700

This is an automated email from the ASF dual-hosted git repository.

yao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new ad5fcae0b0ed [SPARK-48271][SQL] Turn match error in RowEncoder into 
UNSUPPORTED_DATA_TYPE_FOR_ENCODER
ad5fcae0b0ed is described below

commit ad5fcae0b0ed41f7e97ab419b32068e5adf71064
Author: Wenchen Fan <wenc...@databricks.com>
AuthorDate: Wed May 15 16:45:20 2024 +0800

    [SPARK-48271][SQL] Turn match error in RowEncoder into 
UNSUPPORTED_DATA_TYPE_FOR_ENCODER
    
    ### What changes were proposed in this pull request?
    
    Today we can't create a `RowEncoder` with a char/varchar data type, because we 
assumed this could not happen: Spark turns char/varchar into string type in 
leaf nodes. However, advanced users can create custom logical plans, and 
it's hard to guarantee that no char/varchar data type appears anywhere in the 
query plan tree. A UDF return type can also be char/varchar.
    
    This PR adds the UNSUPPORTED_DATA_TYPE_FOR_ENCODER error condition and throws 
it instead of a Scala match error.
    
    ### Why are the changes needed?
    
    Better error: an explicit UNSUPPORTED_DATA_TYPE_FOR_ENCODER error instead of a Scala match error.
    
    ### Does this PR introduce _any_ user-facing change?
    
    no
    
    ### How was this patch tested?
    
    new test
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    no
    
    Closes #46586 from cloud-fan/error.
    
    Authored-by: Wenchen Fan <wenc...@databricks.com>
    Signed-off-by: Kent Yao <y...@apache.org>
---
 .../src/main/resources/error/error-conditions.json      |  6 ++++++
 .../apache/spark/sql/catalyst/encoders/RowEncoder.scala | 12 +++++++++---
 .../src/test/scala/org/apache/spark/sql/UDFSuite.scala  | 17 +++++++++++++++++
 3 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/common/utils/src/main/resources/error/error-conditions.json 
b/common/utils/src/main/resources/error/error-conditions.json
index 730999085de9..75067a1920f7 100644
--- a/common/utils/src/main/resources/error/error-conditions.json
+++ b/common/utils/src/main/resources/error/error-conditions.json
@@ -4207,6 +4207,12 @@
     ],
     "sqlState" : "0A000"
   },
+  "UNSUPPORTED_DATA_TYPE_FOR_ENCODER" : {
+    "message" : [
+      "Cannot create encoder for <dataType>. Please use a different output 
data type for your UDF or DataFrame."
+    ],
+    "sqlState" : "0A000"
+  },
   "UNSUPPORTED_DEFAULT_VALUE" : {
     "message" : [
       "DEFAULT column values is not supported."
diff --git 
a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala
 
b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala
index 16ac283eccb1..c507e952630f 100644
--- 
a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala
+++ 
b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala
@@ -20,9 +20,9 @@ package org.apache.spark.sql.catalyst.encoders
 import scala.collection.mutable
 import scala.reflect.classTag
 
-import org.apache.spark.sql.Row
+import org.apache.spark.sql.{AnalysisException, Row}
 import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.{BinaryEncoder, 
BoxedBooleanEncoder, BoxedByteEncoder, BoxedDoubleEncoder, BoxedFloatEncoder, 
BoxedIntEncoder, BoxedLongEncoder, BoxedShortEncoder, CalendarIntervalEncoder, 
DateEncoder, DayTimeIntervalEncoder, EncoderField, InstantEncoder, 
IterableEncoder, JavaDecimalEncoder, LocalDateEncoder, LocalDateTimeEncoder, 
MapEncoder, NullEncoder, RowEncoder => AgnosticRowEncoder, StringEncoder, 
TimestampEncoder, UDTEncoder, VariantE [...]
-import org.apache.spark.sql.errors.ExecutionErrors
+import org.apache.spark.sql.errors.{DataTypeErrorsBase, ExecutionErrors}
 import org.apache.spark.sql.internal.SqlApiConf
 import org.apache.spark.sql.types._
 import org.apache.spark.util.ArrayImplicits._
@@ -59,7 +59,7 @@ import org.apache.spark.util.ArrayImplicits._
  *   StructType -> org.apache.spark.sql.Row
  * }}}
  */
-object RowEncoder {
+object RowEncoder extends DataTypeErrorsBase {
   def encoderFor(schema: StructType): AgnosticEncoder[Row] = {
     encoderFor(schema, lenient = false)
   }
@@ -124,5 +124,11 @@ object RowEncoder {
           field.nullable,
           field.metadata)
       }.toImmutableArraySeq)
+
+    case _ =>
+      throw new AnalysisException(
+        errorClass = "UNSUPPORTED_DATA_TYPE_FOR_ENCODER",
+        messageParameters = Map("dataType" -> toSQLType(dataType))
+    )
   }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala
index fe47d6c68555..32ad5a94984b 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala
@@ -1194,4 +1194,21 @@ class UDFSuite extends QueryTest with SharedSparkSession 
{
         .select(f(struct(ds2("value").as("_1")))),
       Row(Row(null)))
   }
+
+  test("char/varchar as UDF return type") {
+    Seq(CharType(5), VarcharType(5)).foreach { dt =>
+      val f = udf(
+        new UDF0[String] {
+          override def call(): String = "a"
+        },
+        dt
+      )
+      checkError(
+        intercept[AnalysisException](spark.range(1).select(f())),
+        errorClass = "UNSUPPORTED_DATA_TYPE_FOR_ENCODER",
+        sqlState = "0A000",
+        parameters = Map("dataType" -> s"\"${dt.sql}\"")
+      )
+    }
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

(spark) branch master updated: [SPARK-48271][SQL] Turn match error in RowEncoder into UNSUPPORTED_DATA_TYPE_FOR_ENCODER

Reply via email to