Repository: spark Updated Branches: refs/heads/master 912634b00 -> 2a4dd6f06
[SPARK-24681][SQL] Verify nested column names in Hive metastore ## What changes were proposed in this pull request? This pr added code to check if nested column names do not include ',', ':', and ';' because Hive metastore can't handle these characters in nested column names; ref: https://github.com/apache/hive/blob/release-1.2.1/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/TypeInfoUtils.java#L239 ## How was this patch tested? Added tests in `HiveDDLSuite`. Author: Takeshi Yamamuro <yamam...@apache.org> Closes #21711 from maropu/SPARK-24681. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2a4dd6f0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2a4dd6f0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2a4dd6f0 Branch: refs/heads/master Commit: 2a4dd6f06cfd2f58fda9786c88809e6de695444e Parents: 912634b Author: Takeshi Yamamuro <yamam...@apache.org> Authored: Tue Jul 17 14:15:30 2018 -0700 Committer: Xiao Li <gatorsm...@gmail.com> Committed: Tue Jul 17 14:15:30 2018 -0700 ---------------------------------------------------------------------- .../spark/sql/hive/HiveExternalCatalog.scala | 34 ++++++++++++++++---- .../spark/sql/hive/execution/HiveDDLSuite.scala | 19 +++++++++++ 2 files changed, 46 insertions(+), 7 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/2a4dd6f0/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala ---------------------------------------------------------------------- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala index 44480ce..7f28fc4 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala @@ -138,17 +138,37 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat } /** - * Checks the validity of data column names. Hive metastore disallows the table to use comma in - * data column names. Partition columns do not have such a restriction. Views do not have such - * a restriction. + * Checks the validity of data column names. Hive metastore disallows the table to use some + * special characters (',', ':', and ';') in data column names, including nested column names. + * Partition columns do not have such a restriction. Views do not have such a restriction. */ private def verifyDataSchema( tableName: TableIdentifier, tableType: CatalogTableType, dataSchema: StructType): Unit = { if (tableType != VIEW) { - dataSchema.map(_.name).foreach { colName => - if (colName.contains(",")) { - throw new AnalysisException("Cannot create a table having a column whose name contains " + - s"commas in Hive metastore. Table: $tableName; Column: $colName") + val invalidChars = Seq(",", ":", ";") + def verifyNestedColumnNames(schema: StructType): Unit = schema.foreach { f => + f.dataType match { + case st: StructType => verifyNestedColumnNames(st) + case _ if invalidChars.exists(f.name.contains) => + val invalidCharsString = invalidChars.map(c => s"'$c'").mkString(", ") + val errMsg = "Cannot create a table having a nested column whose name contains " + + s"invalid characters ($invalidCharsString) in Hive metastore. Table: $tableName; " + + s"Column: ${f.name}" + throw new AnalysisException(errMsg) + case _ => + } + } + + dataSchema.foreach { f => + f.dataType match { + // Checks top-level column names + case _ if f.name.contains(",") => + throw new AnalysisException("Cannot create a table having a column whose name " + + s"contains commas in Hive metastore. Table: $tableName; Column: ${f.name}") + // Checks nested column names + case st: StructType => + verifyNestedColumnNames(st) + case _ => } } } http://git-wip-us.apache.org/repos/asf/spark/blob/2a4dd6f0/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala ---------------------------------------------------------------------- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala index 0341c3b..31fd4c5 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala @@ -33,6 +33,7 @@ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.{NoSuchPartitionException, TableAlreadyExistsException} import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.execution.command.{DDLSuite, DDLUtils} +import org.apache.spark.sql.functions._ import org.apache.spark.sql.hive.HiveExternalCatalog import org.apache.spark.sql.hive.HiveUtils.{CONVERT_METASTORE_ORC, CONVERT_METASTORE_PARQUET} import org.apache.spark.sql.hive.orc.OrcFileOperator @@ -2248,4 +2249,22 @@ class HiveDDLSuite checkAnswer(spark.table("t4"), Row(0, 0)) } } + + test("SPARK-24681 checks if nested column names do not include ',', ':', and ';'") { + val expectedMsg = "Cannot create a table having a nested column whose name contains invalid " + + "characters (',', ':', ';') in Hive metastore." + + Seq("nested,column", "nested:column", "nested;column").foreach { nestedColumnName => + withTable("t") { + val e = intercept[AnalysisException] { + spark.range(1) + .select(struct(lit(0).as(nestedColumnName)).as("toplevel")) + .write + .format("hive") + .saveAsTable("t") + }.getMessage + assert(e.contains(expectedMsg)) + } + } + } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org