This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 65fe9efa40a3 [SPARK-47101][SQL] Allow comma to be used in top-level column names and remove the check on nested type definitions in `HiveExternalCatalog.verifyDataSchema`
65fe9efa40a3 is described below
commit 65fe9efa40a3661f5aa1ec851a0d1b41544b3fb4
Author: Kent Yao <[email protected]>
AuthorDate: Wed Feb 21 18:23:54 2024 -0800
[SPARK-47101][SQL] Allow comma to be used in top-level column names and remove the check on nested type definitions in `HiveExternalCatalog.verifyDataSchema`
### What changes were proposed in this pull request?
> In Hive 0.13 and later, column names can contain any [Unicode](http://en.wikipedia.org/wiki/List_of_Unicode_characters) character (see [HIVE-6013](https://issues.apache.org/jira/browse/HIVE-6013)), however, dot (.) and colon (:) yield errors on querying, so they are disallowed in Hive 1.2.0 (see [HIVE-10120](https://issues.apache.org/jira/browse/HIVE-10120)). Any column name that is specified within backticks (`) is treated literally. Within a backtick string, use double backticks ( [...]
According to the Hive documentation, column names may contain any character from the Unicode set.
This PR changes `HiveExternalCatalog.verifyDataSchema` to:
- allow commas to be used in top-level column names (see the sketch after this list)
- remove the check for invalid characters in nested type definitions, which was hard-coded to ",:;" and turned out to be incomplete; for example, "^", "%", etc. are also not allowed. These checks are now delegated to Hive API calls instead.
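For illustration, here is a minimal sketch of the newly allowed behavior, mirroring the new test added to `HiveDDLSuite` in this patch (assumes a Hive-enabled test session where `sql`, `checkAnswer`, and `Row` are in scope):

```scala
// A comma in a top-level column name was previously rejected by
// verifyDataSchema with INVALID_HIVE_COLUMN_NAME; it now succeeds.
sql("CREATE TABLE t1 STORED AS parquet SELECT id AS `a,b` FROM range(1)")
checkAnswer(sql("SELECT * FROM t1"), Row(0))
```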
### Why are the changes needed?
Improvement.
### Does this PR introduce _any_ user-facing change?
yes, some special characters are now allowed and errors for some invalid
characters now throw Spark Errors instead of Hive Meta Errors
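As a sketch of the new error path, drawn from the updated `HiveDDLSuite` test (again assuming a Hive-enabled test session, with `SparkException` imported): a delimiter character in a nested field name is no longer pre-checked by Spark, so the failure surfaces from the Hive type parser wrapped in a Spark error class:

```scala
// The comma in the nested field name `abc,xyz` now fails inside the Hive
// API call and is reported as CANNOT_RECOGNIZE_HIVE_TYPE rather than the
// removed INVALID_HIVE_COLUMN_NAME.
val e = intercept[SparkException] {
  sql("CREATE TABLE t (a array<struct<`abc,xyz`:int>>) USING hive")
}
assert(e.getErrorClass == "CANNOT_RECOGNIZE_HIVE_TYPE")
```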
### How was this patch tested?
New tests.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #45180 from yaooqinn/SPARK-47101.
Authored-by: Kent Yao <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
---
.../src/main/resources/error/error-classes.json | 6 ---
docs/sql-error-conditions.md | 6 ---
.../spark/sql/hive/HiveExternalCatalog.scala | 51 +-----------------
.../spark/sql/hive/execution/HiveDDLSuite.scala | 60 ++++++++++++----------
.../spark/sql/hive/execution/SQLQuerySuite.scala | 35 -------------
5 files changed, 33 insertions(+), 125 deletions(-)
diff --git a/common/utils/src/main/resources/error/error-classes.json b/common/utils/src/main/resources/error/error-classes.json
index a89f0ea26727..c6149ce35a43 100644
--- a/common/utils/src/main/resources/error/error-classes.json
+++ b/common/utils/src/main/resources/error/error-classes.json
@@ -2008,12 +2008,6 @@
     },
     "sqlState" : "HY000"
   },
-  "INVALID_HIVE_COLUMN_NAME" : {
-    "message" : [
-      "Cannot create the table <tableName> having the column <columnName> whose name contains invalid characters <invalidChars> in Hive metastore."
-    ],
-    "sqlState" : "42K05"
-  },
   "INVALID_IDENTIFIER" : {
     "message" : [
       "The identifier <ident> is invalid. Please, consider quoting it with back-quotes as `<ident>`."
diff --git a/docs/sql-error-conditions.md b/docs/sql-error-conditions.md
index ebf7436f96e7..e458cd5a337b 100644
--- a/docs/sql-error-conditions.md
+++ b/docs/sql-error-conditions.md
@@ -1190,12 +1190,6 @@ The handle `<handle>` is invalid.
 
 For more details see [INVALID_HANDLE](sql-error-conditions-invalid-handle-error-class.html)
 
-### INVALID_HIVE_COLUMN_NAME
-
-[SQLSTATE: 42K05](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation)
-
-Cannot create the table `<tableName>` having the column `<columnName>` whose name contains invalid characters `<invalidChars>` in Hive metastore.
-
 ### INVALID_IDENTIFIER
 
 [SQLSTATE: 42602](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation)
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
index 84f75e3ef503..32fc8a452106 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
@@ -42,13 +42,12 @@ import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils._
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.types.DataTypeUtils
 import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, CharVarcharUtils}
-import org.apache.spark.sql.catalyst.util.TypeUtils.{toSQLId, toSQLValue}
 import org.apache.spark.sql.execution.command.DDLUtils
 import org.apache.spark.sql.execution.datasources.{PartitioningUtils, SourceOptions}
 import org.apache.spark.sql.hive.client.HiveClient
 import org.apache.spark.sql.internal.HiveSerDe
 import org.apache.spark.sql.internal.StaticSQLConf._
-import org.apache.spark.sql.types.{AnsiIntervalType, ArrayType, DataType, MapType, StructType, TimestampNTZType}
+import org.apache.spark.sql.types._
 
 /**
  * A persistent implementation of the system catalog using Hive.
@@ -150,51 +149,6 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
     }
   }
 
-  /**
-   * Checks the validity of data column names. Hive metastore disallows the table to use some
-   * special characters (',', ':', and ';') in data column names, including nested column names.
-   * Partition columns do not have such a restriction. Views do not have such a restriction.
-   */
-  private def verifyDataSchema(
-      tableName: TableIdentifier, tableType: CatalogTableType, dataSchema: StructType): Unit = {
-    if (tableType != VIEW) {
-      val invalidChars = Seq(",", ":", ";")
-      def verifyNestedColumnNames(schema: StructType): Unit = schema.foreach { f =>
-        f.dataType match {
-          case st: StructType => verifyNestedColumnNames(st)
-          case _ if invalidChars.exists(f.name.contains) =>
-            val invalidCharsString = invalidChars.map(c => s"'$c'").mkString(", ")
-            throw new AnalysisException(
-              errorClass = "INVALID_HIVE_COLUMN_NAME",
-              messageParameters = Map(
-                "invalidChars" -> invalidCharsString,
-                "tableName" -> toSQLId(tableName.nameParts),
-                "columnName" -> toSQLId(f.name)
-              ))
-          case _ =>
-        }
-      }
-
-      dataSchema.foreach { f =>
-        f.dataType match {
-          // Checks top-level column names
-          case _ if f.name.contains(",") =>
-            throw new AnalysisException(
-              errorClass = "INVALID_HIVE_COLUMN_NAME",
-              messageParameters = Map(
-                "invalidChars" -> toSQLValue(","),
-                "tableName" -> toSQLId(tableName.nameParts),
-                "columnName" -> toSQLId(f.name)
-              ))
-          // Checks nested column names
-          case st: StructType =>
-            verifyNestedColumnNames(st)
-          case _ =>
-        }
-      }
-    }
-  }
-
   // --------------------------------------------------------------------------
   // Databases
   // --------------------------------------------------------------------------
@@ -260,8 +214,6 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
     val table = tableDefinition.identifier.table
     requireDbExists(db)
     verifyTableProperties(tableDefinition)
-    verifyDataSchema(
-      tableDefinition.identifier, tableDefinition.tableType, tableDefinition.dataSchema)
 
     if (tableExists(db, table) && !ignoreIfExists) {
       throw new TableAlreadyExistsException(db = db, table = table)
@@ -711,7 +663,6 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
       newDataSchema: StructType): Unit = withClient {
     requireTableExists(db, table)
     val oldTable = getTable(db, table)
-    verifyDataSchema(oldTable.identifier, oldTable.tableType, newDataSchema)
     val schemaProps =
       tableMetaToTableProps(oldTable, StructType(newDataSchema ++ oldTable.partitionSchema)).toMap
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala
index 15a7796a72b5..7b0e0b28780c 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala
@@ -34,10 +34,10 @@ import org.apache.spark.sql.catalyst.parser.{CatalystSqlParser, ParseException}
 import org.apache.spark.sql.connector.catalog.CatalogManager
 import org.apache.spark.sql.connector.catalog.CatalogManager.SESSION_CATALOG_NAME
 import org.apache.spark.sql.connector.catalog.SupportsNamespaces.PROP_OWNER
+import org.apache.spark.sql.errors.DataTypeErrors.toSQLType
 import org.apache.spark.sql.execution.command.{DDLSuite, DDLUtils}
 import org.apache.spark.sql.execution.datasources.orc.OrcCompressionCodec
 import org.apache.spark.sql.execution.datasources.parquet.{ParquetCompressionCodec, ParquetFooterReader}
-import org.apache.spark.sql.functions._
 import org.apache.spark.sql.hive.{HiveExternalCatalog, HiveUtils}
 import org.apache.spark.sql.hive.HiveUtils.{CONVERT_METASTORE_ORC, CONVERT_METASTORE_PARQUET}
 import org.apache.spark.sql.hive.orc.OrcFileOperator
@@ -2876,22 +2876,39 @@ class HiveDDLSuite
     }
   }
 
-  test("SPARK-24681 checks if nested column names do not include ',', ':', and ';'") {
-    Seq("nested,column", "nested:column", "nested;column").foreach { nestedColumnName =>
+  test("SPARK-47101 checks if nested column names do not include invalid characters") {
+    // delimiter characters
+    Seq(",", ":").foreach { c =>
+      val typ = s"array<struct<`abc${c}xyz`:int>>"
+      // The regex is from HiveClientImpl.getSparkSQLDataType, please keep them in sync.
+      val replaced = typ.replaceAll("`", "").replaceAll("(?<=struct<|,)([^,<:]+)(?=:)", "`$1`")
+      withTable("t") {
+        checkError(
+          exception = intercept[SparkException] {
+            sql(s"CREATE TABLE t (a $typ) USING hive")
+          },
+          errorClass = "CANNOT_RECOGNIZE_HIVE_TYPE",
+          parameters = Map(
+            "fieldType" -> toSQLType(replaced),
+            "fieldName" -> "`a`")
+        )
+      }
+    }
+    // other special characters
+    Seq(";", "^", "\\", "/", "%").foreach { c =>
+      val typ = s"array<struct<`abc${c}xyz`:int>>"
+      val replaced = typ.replaceAll("`", "")
+      val msg = s"java.lang.IllegalArgumentException: Error: : expected at the position " +
+        s"16 of '$replaced' but '$c' is found."
       withTable("t") {
         checkError(
           exception = intercept[AnalysisException] {
-            spark.range(1)
-              .select(struct(lit(0).as(nestedColumnName)).as("toplevel"))
-              .write
-              .format("hive")
-              .saveAsTable("t")
+            sql(s"CREATE TABLE t (a $typ) USING hive")
           },
-          errorClass = "INVALID_HIVE_COLUMN_NAME",
+          errorClass = "_LEGACY_ERROR_TEMP_3065",
           parameters = Map(
-            "invalidChars" -> "',', ':', ';'",
-            "tableName" -> "`spark_catalog`.`default`.`t`",
-            "columnName" -> s"`$nestedColumnName`")
+            "clazz" -> "org.apache.hadoop.hive.ql.metadata.HiveException",
+            "msg" -> msg)
         )
       }
     }
@@ -3385,24 +3402,11 @@ class HiveDDLSuite
     }
   }
 
-  test("SPARK-44911: Create the table with invalid column") {
+  test("SPARK-47101: comma is allowed in column name") {
     val tbl = "t1"
     withTable(tbl) {
-      val e = intercept[AnalysisException] {
-        sql(
-          s"""
-            |CREATE TABLE t1
-            |STORED AS parquet
-            |SELECT id, DATE'2018-01-01' + MAKE_DT_INTERVAL(0, id) FROM RANGE(0, 10)
-          """.stripMargin)
-      }
-      checkError(e,
-        errorClass = "INVALID_HIVE_COLUMN_NAME",
-        parameters = Map(
-          "invalidChars" -> "','",
-          "tableName" -> "`spark_catalog`.`default`.`t1`",
-          "columnName" -> "`DATE '2018-01-01' + make_dt_interval(0, id, 0, 0`.`000000)`")
-      )
+      sql("CREATE TABLE t1 STORED AS parquet SELECT id as `a,b` FROM range(1)")
+      checkAnswer(sql("SELECT * FROM t1"), Row(0))
     }
   }
 }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
index 1a0c2f6165a4..0bcac639443c 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
@@ -2141,41 +2141,6 @@ abstract class SQLQuerySuiteBase extends QueryTest with SQLTestUtils with TestHi
     }
   }
 
-  test("Auto alias construction of get_json_object") {
-    val df = Seq(("1", """{"f1": "value1", "f5": 5.23}""")).toDF("key", "jstring")
-
-    withTable("t") {
-      val e = intercept[AnalysisException] {
-        df.select($"key", functions.get_json_object($"jstring", "$.f1"))
-          .write.format("hive").saveAsTable("t")
-      }
-      checkError(e,
-        errorClass = "INVALID_HIVE_COLUMN_NAME",
-        parameters = Map(
-          "invalidChars" -> "','",
-          "tableName" -> "`spark_catalog`.`default`.`t`",
-          "columnName" -> "`get_json_object(jstring, $`.`f1)`")
-      )
-    }
-
-    withTempView("tempView") {
-      withTable("t") {
-        df.createTempView("tempView")
-        val e = intercept[AnalysisException] {
-          sql("CREATE TABLE t USING hive AS " +
-            "SELECT key, get_json_object(jstring, '$.f1') FROM tempView")
-        }
-        checkError(e,
-          errorClass = "INVALID_HIVE_COLUMN_NAME",
-          parameters = Map(
-            "invalidChars" -> "','",
-            "tableName" -> "`spark_catalog`.`default`.`t`",
-            "columnName" -> "`get_json_object(jstring, $`.`f1)`")
-        )
-      }
-    }
-  }
-
   test("SPARK-19912 String literals should be escaped for Hive metastore partition pruning") {
     withTable("spark_19912") {
       Seq(