This is an automated email from the ASF dual-hosted git repository. yao pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new ad5fcae0b0ed [SPARK-48271][SQL] Turn match error in RowEncoder into UNSUPPORTED_DATA_TYPE_FOR_ENCODER ad5fcae0b0ed is described below commit ad5fcae0b0ed41f7e97ab419b32068e5adf71064 Author: Wenchen Fan <wenc...@databricks.com> AuthorDate: Wed May 15 16:45:20 2024 +0800 [SPARK-48271][SQL] Turn match error in RowEncoder into UNSUPPORTED_DATA_TYPE_FOR_ENCODER ### What changes were proposed in this pull request? Today we can't create a `RowEncoder` with the char/varchar data types, because we believed this could not happen: Spark turns char/varchar into the string type in leaf nodes. However, advanced users can create custom logical plans, so it's hard to guarantee that no char/varchar data type appears anywhere in the query plan tree. A UDF return type can also be char/varchar. This PR adds the UNSUPPORTED_DATA_TYPE_FOR_ENCODER error condition, which is thrown instead of a Scala `MatchError`. ### Why are the changes needed? better error ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? new test ### Was this patch authored or co-authored using generative AI tooling? no Closes #46586 from cloud-fan/error. Authored-by: Wenchen Fan <wenc...@databricks.com> Signed-off-by: Kent Yao <y...@apache.org> --- .../src/main/resources/error/error-conditions.json | 6 ++++++ .../apache/spark/sql/catalyst/encoders/RowEncoder.scala | 12 +++++++++--- .../src/test/scala/org/apache/spark/sql/UDFSuite.scala | 17 +++++++++++++++++ 3 files changed, 32 insertions(+), 3 deletions(-) diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json index 730999085de9..75067a1920f7 100644 --- a/common/utils/src/main/resources/error/error-conditions.json +++ b/common/utils/src/main/resources/error/error-conditions.json @@ -4207,6 +4207,12 @@ ], "sqlState" : "0A000" }, + "UNSUPPORTED_DATA_TYPE_FOR_ENCODER" : { + "message" : [ + "Cannot create encoder for <dataType>. 
Please use a different output data type for your UDF or DataFrame." + ], + "sqlState" : "0A000" + }, "UNSUPPORTED_DEFAULT_VALUE" : { "message" : [ "DEFAULT column values is not supported." diff --git a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala index 16ac283eccb1..c507e952630f 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala @@ -20,9 +20,9 @@ package org.apache.spark.sql.catalyst.encoders import scala.collection.mutable import scala.reflect.classTag -import org.apache.spark.sql.Row +import org.apache.spark.sql.{AnalysisException, Row} import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.{BinaryEncoder, BoxedBooleanEncoder, BoxedByteEncoder, BoxedDoubleEncoder, BoxedFloatEncoder, BoxedIntEncoder, BoxedLongEncoder, BoxedShortEncoder, CalendarIntervalEncoder, DateEncoder, DayTimeIntervalEncoder, EncoderField, InstantEncoder, IterableEncoder, JavaDecimalEncoder, LocalDateEncoder, LocalDateTimeEncoder, MapEncoder, NullEncoder, RowEncoder => AgnosticRowEncoder, StringEncoder, TimestampEncoder, UDTEncoder, VariantE [...] 
-import org.apache.spark.sql.errors.ExecutionErrors +import org.apache.spark.sql.errors.{DataTypeErrorsBase, ExecutionErrors} import org.apache.spark.sql.internal.SqlApiConf import org.apache.spark.sql.types._ import org.apache.spark.util.ArrayImplicits._ @@ -59,7 +59,7 @@ import org.apache.spark.util.ArrayImplicits._ * StructType -> org.apache.spark.sql.Row * }}} */ -object RowEncoder { +object RowEncoder extends DataTypeErrorsBase { def encoderFor(schema: StructType): AgnosticEncoder[Row] = { encoderFor(schema, lenient = false) } @@ -124,5 +124,11 @@ object RowEncoder { field.nullable, field.metadata) }.toImmutableArraySeq) + + case _ => + throw new AnalysisException( + errorClass = "UNSUPPORTED_DATA_TYPE_FOR_ENCODER", + messageParameters = Map("dataType" -> toSQLType(dataType)) + ) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala index fe47d6c68555..32ad5a94984b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala @@ -1194,4 +1194,21 @@ class UDFSuite extends QueryTest with SharedSparkSession { .select(f(struct(ds2("value").as("_1")))), Row(Row(null))) } + + test("char/varchar as UDF return type") { + Seq(CharType(5), VarcharType(5)).foreach { dt => + val f = udf( + new UDF0[String] { + override def call(): String = "a" + }, + dt + ) + checkError( + intercept[AnalysisException](spark.range(1).select(f())), + errorClass = "UNSUPPORTED_DATA_TYPE_FOR_ENCODER", + sqlState = "0A000", + parameters = Map("dataType" -> s"\"${dt.sql}\"") + ) + } + } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org