This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 65fe9efa40a3 [SPARK-47101][SQL] Allow comma to be used in top-level column names and remove check nested type definition in `HiveExternalCatalog.verifyDataSchema` 65fe9efa40a3 is described below commit 65fe9efa40a3661f5aa1ec851a0d1b41544b3fb4 Author: Kent Yao <y...@apache.org> AuthorDate: Wed Feb 21 18:23:54 2024 -0800 [SPARK-47101][SQL] Allow comma to be used in top-level column names and remove check nested type definition in `HiveExternalCatalog.verifyDataSchema` ### What changes were proposed in this pull request? > In Hive 0.13 and later, column names can contain any [Unicode](http://en.wikipedia.org/wiki/List_of_Unicode_characters) character (see [HIVE-6013](https://issues.apache.org/jira/browse/HIVE-6013)), however, dot (.) and colon (:) yield errors on querying, so they are disallowed in Hive 1.2.0 (see [HIVE-10120](https://issues.apache.org/jira/browse/HIVE-10120)). Any column name that is specified within backticks (`) is treated literally. Within a backtick string, use double backticks ( [...] According to the Hive documentation, column names can contain any character from the Unicode set. This PR makes HiveExternalCatalog.verifyDataSchema: - Allow comma to be used in top-level column names - Remove the check for invalid characters in nested type definitions against the hard-coded set ",:;", which turns out to be incomplete; for example, "^", "%", etc., are also not allowed. These checks are instead deferred to the Hive API calls. ### Why are the changes needed? Improvement. ### Does this PR introduce _any_ user-facing change? Yes: some special characters are now allowed, and some invalid characters now produce Spark errors instead of Hive metastore errors. ### How was this patch tested? New tests. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #45180 from yaooqinn/SPARK-47101. 
Authored-by: Kent Yao <y...@apache.org> Signed-off-by: Dongjoon Hyun <dh...@apple.com> --- .../src/main/resources/error/error-classes.json | 6 --- docs/sql-error-conditions.md | 6 --- .../spark/sql/hive/HiveExternalCatalog.scala | 51 +----------------- .../spark/sql/hive/execution/HiveDDLSuite.scala | 60 ++++++++++++---------- .../spark/sql/hive/execution/SQLQuerySuite.scala | 35 ------------- 5 files changed, 33 insertions(+), 125 deletions(-) diff --git a/common/utils/src/main/resources/error/error-classes.json b/common/utils/src/main/resources/error/error-classes.json index a89f0ea26727..c6149ce35a43 100644 --- a/common/utils/src/main/resources/error/error-classes.json +++ b/common/utils/src/main/resources/error/error-classes.json @@ -2008,12 +2008,6 @@ }, "sqlState" : "HY000" }, - "INVALID_HIVE_COLUMN_NAME" : { - "message" : [ - "Cannot create the table <tableName> having the column <columnName> whose name contains invalid characters <invalidChars> in Hive metastore." - ], - "sqlState" : "42K05" - }, "INVALID_IDENTIFIER" : { "message" : [ "The identifier <ident> is invalid. Please, consider quoting it with back-quotes as `<ident>`." diff --git a/docs/sql-error-conditions.md b/docs/sql-error-conditions.md index ebf7436f96e7..e458cd5a337b 100644 --- a/docs/sql-error-conditions.md +++ b/docs/sql-error-conditions.md @@ -1190,12 +1190,6 @@ The handle `<handle>` is invalid. For more details see [INVALID_HANDLE](sql-error-conditions-invalid-handle-error-class.html) -### INVALID_HIVE_COLUMN_NAME - -[SQLSTATE: 42K05](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) - -Cannot create the table `<tableName>` having the column `<columnName>` whose name contains invalid characters `<invalidChars>` in Hive metastore. 
- ### INVALID_IDENTIFIER [SQLSTATE: 42602](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala index 84f75e3ef503..32fc8a452106 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala @@ -42,13 +42,12 @@ import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.types.DataTypeUtils import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, CharVarcharUtils} -import org.apache.spark.sql.catalyst.util.TypeUtils.{toSQLId, toSQLValue} import org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.execution.datasources.{PartitioningUtils, SourceOptions} import org.apache.spark.sql.hive.client.HiveClient import org.apache.spark.sql.internal.HiveSerDe import org.apache.spark.sql.internal.StaticSQLConf._ -import org.apache.spark.sql.types.{AnsiIntervalType, ArrayType, DataType, MapType, StructType, TimestampNTZType} +import org.apache.spark.sql.types._ /** * A persistent implementation of the system catalog using Hive. @@ -150,51 +149,6 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat } } - /** - * Checks the validity of data column names. Hive metastore disallows the table to use some - * special characters (',', ':', and ';') in data column names, including nested column names. - * Partition columns do not have such a restriction. Views do not have such a restriction. 
- */ - private def verifyDataSchema( - tableName: TableIdentifier, tableType: CatalogTableType, dataSchema: StructType): Unit = { - if (tableType != VIEW) { - val invalidChars = Seq(",", ":", ";") - def verifyNestedColumnNames(schema: StructType): Unit = schema.foreach { f => - f.dataType match { - case st: StructType => verifyNestedColumnNames(st) - case _ if invalidChars.exists(f.name.contains) => - val invalidCharsString = invalidChars.map(c => s"'$c'").mkString(", ") - throw new AnalysisException( - errorClass = "INVALID_HIVE_COLUMN_NAME", - messageParameters = Map( - "invalidChars" -> invalidCharsString, - "tableName" -> toSQLId(tableName.nameParts), - "columnName" -> toSQLId(f.name) - )) - case _ => - } - } - - dataSchema.foreach { f => - f.dataType match { - // Checks top-level column names - case _ if f.name.contains(",") => - throw new AnalysisException( - errorClass = "INVALID_HIVE_COLUMN_NAME", - messageParameters = Map( - "invalidChars" -> toSQLValue(","), - "tableName" -> toSQLId(tableName.nameParts), - "columnName" -> toSQLId(f.name) - )) - // Checks nested column names - case st: StructType => - verifyNestedColumnNames(st) - case _ => - } - } - } - } - // -------------------------------------------------------------------------- // Databases // -------------------------------------------------------------------------- @@ -260,8 +214,6 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat val table = tableDefinition.identifier.table requireDbExists(db) verifyTableProperties(tableDefinition) - verifyDataSchema( - tableDefinition.identifier, tableDefinition.tableType, tableDefinition.dataSchema) if (tableExists(db, table) && !ignoreIfExists) { throw new TableAlreadyExistsException(db = db, table = table) @@ -711,7 +663,6 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat newDataSchema: StructType): Unit = withClient { requireTableExists(db, table) val oldTable = getTable(db, table) - 
verifyDataSchema(oldTable.identifier, oldTable.tableType, newDataSchema) val schemaProps = tableMetaToTableProps(oldTable, StructType(newDataSchema ++ oldTable.partitionSchema)).toMap diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala index 15a7796a72b5..7b0e0b28780c 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala @@ -34,10 +34,10 @@ import org.apache.spark.sql.catalyst.parser.{CatalystSqlParser, ParseException} import org.apache.spark.sql.connector.catalog.CatalogManager import org.apache.spark.sql.connector.catalog.CatalogManager.SESSION_CATALOG_NAME import org.apache.spark.sql.connector.catalog.SupportsNamespaces.PROP_OWNER +import org.apache.spark.sql.errors.DataTypeErrors.toSQLType import org.apache.spark.sql.execution.command.{DDLSuite, DDLUtils} import org.apache.spark.sql.execution.datasources.orc.OrcCompressionCodec import org.apache.spark.sql.execution.datasources.parquet.{ParquetCompressionCodec, ParquetFooterReader} -import org.apache.spark.sql.functions._ import org.apache.spark.sql.hive.{HiveExternalCatalog, HiveUtils} import org.apache.spark.sql.hive.HiveUtils.{CONVERT_METASTORE_ORC, CONVERT_METASTORE_PARQUET} import org.apache.spark.sql.hive.orc.OrcFileOperator @@ -2876,22 +2876,39 @@ class HiveDDLSuite } } - test("SPARK-24681 checks if nested column names do not include ',', ':', and ';'") { - Seq("nested,column", "nested:column", "nested;column").foreach { nestedColumnName => + test("SPARK-47101 checks if nested column names do not include invalid characters") { + // delimiter characters + Seq(",", ":").foreach { c => + val typ = s"array<struct<`abc${c}xyz`:int>>" + // The regex is from HiveClientImpl.getSparkSQLDataType, please keep them in sync. 
+ val replaced = typ.replaceAll("`", "").replaceAll("(?<=struct<|,)([^,<:]+)(?=:)", "`$1`") + withTable("t") { + checkError( + exception = intercept[SparkException] { + sql(s"CREATE TABLE t (a $typ) USING hive") + }, + errorClass = "CANNOT_RECOGNIZE_HIVE_TYPE", + parameters = Map( + "fieldType" -> toSQLType(replaced), + "fieldName" -> "`a`") + ) + } + } + // other special characters + Seq(";", "^", "\\", "/", "%").foreach { c => + val typ = s"array<struct<`abc${c}xyz`:int>>" + val replaced = typ.replaceAll("`", "") + val msg = s"java.lang.IllegalArgumentException: Error: : expected at the position " + + s"16 of '$replaced' but '$c' is found." withTable("t") { checkError( exception = intercept[AnalysisException] { - spark.range(1) - .select(struct(lit(0).as(nestedColumnName)).as("toplevel")) - .write - .format("hive") - .saveAsTable("t") + sql(s"CREATE TABLE t (a $typ) USING hive") }, - errorClass = "INVALID_HIVE_COLUMN_NAME", + errorClass = "_LEGACY_ERROR_TEMP_3065", parameters = Map( - "invalidChars" -> "',', ':', ';'", - "tableName" -> "`spark_catalog`.`default`.`t`", - "columnName" -> s"`$nestedColumnName`") + "clazz" -> "org.apache.hadoop.hive.ql.metadata.HiveException", + "msg" -> msg) ) } } @@ -3385,24 +3402,11 @@ class HiveDDLSuite } } - test("SPARK-44911: Create the table with invalid column") { + test("SPARK-47101: comma is allowed in column name") { val tbl = "t1" withTable(tbl) { - val e = intercept[AnalysisException] { - sql( - s""" - |CREATE TABLE t1 - |STORED AS parquet - |SELECT id, DATE'2018-01-01' + MAKE_DT_INTERVAL(0, id) FROM RANGE(0, 10) - """.stripMargin) - } - checkError(e, - errorClass = "INVALID_HIVE_COLUMN_NAME", - parameters = Map( - "invalidChars" -> "','", - "tableName" -> "`spark_catalog`.`default`.`t1`", - "columnName" -> "`DATE '2018-01-01' + make_dt_interval(0, id, 0, 0`.`000000)`") - ) + sql("CREATE TABLE t1 STORED AS parquet SELECT id as `a,b` FROM range(1)") + checkAnswer(sql("SELECT * FROM t1"), Row(0)) } } } diff --git 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 1a0c2f6165a4..0bcac639443c 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -2141,41 +2141,6 @@ abstract class SQLQuerySuiteBase extends QueryTest with SQLTestUtils with TestHi } } - test("Auto alias construction of get_json_object") { - val df = Seq(("1", """{"f1": "value1", "f5": 5.23}""")).toDF("key", "jstring") - - withTable("t") { - val e = intercept[AnalysisException] { - df.select($"key", functions.get_json_object($"jstring", "$.f1")) - .write.format("hive").saveAsTable("t") - } - checkError(e, - errorClass = "INVALID_HIVE_COLUMN_NAME", - parameters = Map( - "invalidChars" -> "','", - "tableName" -> "`spark_catalog`.`default`.`t`", - "columnName" -> "`get_json_object(jstring, $`.`f1)`") - ) - } - - withTempView("tempView") { - withTable("t") { - df.createTempView("tempView") - val e = intercept[AnalysisException] { - sql("CREATE TABLE t USING hive AS " + - "SELECT key, get_json_object(jstring, '$.f1') FROM tempView") - } - checkError(e, - errorClass = "INVALID_HIVE_COLUMN_NAME", - parameters = Map( - "invalidChars" -> "','", - "tableName" -> "`spark_catalog`.`default`.`t`", - "columnName" -> "`get_json_object(jstring, $`.`f1)`") - ) - } - } - } - test("SPARK-19912 String literals should be escaped for Hive metastore partition pruning") { withTable("spark_19912") { Seq( --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org