MaxGekk commented on code in PR #56850:
URL: https://github.com/apache/spark/pull/56850#discussion_r3493017841
##########
sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala:
##########
@@ -683,6 +683,56 @@ class InsertSuite extends QueryTest with TestHiveSingleton
with BeforeAndAfter {
}
}
+ test("SPARK-57556: TIME type is unsupported when writing to a Hive serde
directory") {
+ // Disable native data source conversion so that the write goes through
the Hive serde
+ // path (HiveFileFormat) instead of a native data source that may support
TIME.
+ withSQLConf(HiveUtils.CONVERT_METASTORE_INSERT_DIR.key -> "false") {
+ withTempDir { dir =>
+ // InsertIntoHiveDirCommand wraps the failure in a SparkException, so
assert on the cause.
+ val e = intercept[SparkException] {
+ sql(
+ s"""
+ |INSERT OVERWRITE LOCAL DIRECTORY '${dir.toURI.getPath}'
+ |STORED AS PARQUET
+ |SELECT TIME'12:01:02' AS c
+ """.stripMargin)
+ }
+ checkError(
+ exception = e.getCause.asInstanceOf[AnalysisException],
+ condition = "UNSUPPORTED_DATA_TYPE_FOR_DATASOURCE",
+ parameters = Map(
+ "columnName" -> "`c`",
+ "columnType" -> s"\"${TimeType().sql}\"",
+ "format" -> "Hive"))
+ }
+ }
+ }
+
+ test("SPARK-57556: nested TIME type is unsupported when writing to a Hive
serde directory") {
+ // Exercises HiveFileFormat.supportDataType's recursion into nested types:
a TIME nested inside
+ // an array must also be rejected, with the full (array) column type
reported.
+ withSQLConf(HiveUtils.CONVERT_METASTORE_INSERT_DIR.key -> "false") {
+ withTempDir { dir =>
+ // InsertIntoHiveDirCommand wraps the failure in a SparkException, so
assert on the cause.
+ val e = intercept[SparkException] {
+ sql(
+ s"""
+ |INSERT OVERWRITE LOCAL DIRECTORY '${dir.toURI.getPath}'
+ |STORED AS PARQUET
+ |SELECT array(TIME'12:01:02') AS c
+ """.stripMargin)
+ }
+ checkError(
+ exception = e.getCause.asInstanceOf[AnalysisException],
+ condition = "UNSUPPORTED_DATA_TYPE_FOR_DATASOURCE",
+ parameters = Map(
+ "columnName" -> "`c`",
+ "columnType" -> s"\"${ArrayType(TimeType()).sql}\"",
+ "format" -> "Hive"))
+ }
+ }
+ }
+
Review Comment:
Good idea, but it turns out a Hive serde table cannot hold a TIME column in
the first place, so a managed-table `INSERT INTO` never reaches the new
write-path check. `CREATE TABLE t (c TIME) STORED AS PARQUET` fails at
metastore creation:
```
AnalysisException: HiveException: IllegalArgumentException:
Error: type expected at the position 0 of 'time(6)' but 'time' is found.
at HiveExternalCatalog.createTable ... at CreateTableCommand.run
```
`HiveClientImpl.toHiveColumn` emits the catalog string `time(6)` as the Hive
type, which Hive cannot parse. So the `INSERT OVERWRITE DIRECTORY ... STORED
AS` cases (scalar + nested) are the ones that exercise
`HiveFileFormat.supportDataType` directly; I left this as-is.
##########
sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala:
##########
@@ -407,6 +408,19 @@ class HiveUDFSuite extends QueryTest with
TestHiveSingleton {
}
}
+ test("SPARK-57556: TIME type is unsupported as a Hive UDF argument") {
+ withUserDefinedFunction("testGenericUDFHash" -> true) {
+ sql(s"CREATE TEMPORARY FUNCTION testGenericUDFHash AS
'${classOf[GenericUDFHash].getName}'")
+ // The Hive UDF resolver wraps the underlying failure, but the message
must still clearly
+ // identify the unsupported TIME type rather than surfacing a
MatchError/internal error.
+ val e = intercept[AnalysisException] {
+ sql("SELECT testGenericUDFHash(TIME'12:01:02')").collect()
+ }
+ assert(e.getMessage.contains("UNSUPPORTED_DATATYPE"))
+ assert(e.getMessage.contains(TimeType().sql))
Review Comment:
I looked into this but `checkError` does not cleanly apply here. The Hive
UDF resolver in `HiveSessionStateBuilder.makeFunctionExpression` catches the
inner `UNSUPPORTED_DATATYPE` and re-wraps it into a new
`_LEGACY_ERROR_TEMP_3084` `AnalysisException`, capturing the original only as
the `e` string parameter (`e.toString`) and copying its stack trace -- it does
not attach it as a cause (the 2-arg `AnalysisException(errorClass,
messageParameters)` constructor sets `cause = None`). So `getCause` is null and
there is no inner condition to assert against; the only structured option would
be `checkError` on `_LEGACY_ERROR_TEMP_3084` with a brittle full-message `e`
param.
I kept `assert(...contains...)` and filed SPARK-57750 (sub-task of
SPARK-37935) to give that legacy error a proper name and attach the cause,
after which this can become a clean nested `checkError`.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]