Repository: spark Updated Branches: refs/heads/master a6a18a457 -> 3b7fb84cf
[SPARK-15676][SQL] Disallow Column Names as Partition Columns For Hive Tables #### What changes were proposed in this pull request? When creating a Hive table (not a data source table), a common user error is to specify an existing column name as a partition column. Below is what Hive returns in this case: ``` hive> CREATE TABLE partitioned (id bigint, data string) PARTITIONED BY (data string, part string); FAILED: SemanticException [Error 10035]: Column repeated in partitioning columns ``` Currently, the error we issue is very confusing: ``` org.apache.spark.sql.AnalysisException: org.apache.hadoop.hive.ql.metadata.HiveException: MetaException(message:For direct MetaStore DB connections, we don't support retries at the client level.); ``` This PR fixes the above issue by catching the usage error in `Parser`. It also rejects duplicate column names in the table definition. #### How was this patch tested? Added test cases to `DDLCommandSuite`. Author: gatorsmile <[email protected]> Closes #13415 from gatorsmile/partitionColumnsInTableSchema. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3b7fb84c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3b7fb84c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3b7fb84c Branch: refs/heads/master Commit: 3b7fb84cf88bcae56713fd56396db537fa18f2e5 Parents: a6a18a4 Author: gatorsmile <[email protected]> Authored: Mon Jun 13 13:22:46 2016 -0700 Committer: Yin Huai <[email protected]> Committed: Mon Jun 13 13:22:46 2016 -0700 ---------------------------------------------------------------------- .../spark/sql/execution/SparkSqlParser.scala | 17 +++++++++++++++++ .../sql/execution/command/DDLCommandSuite.scala | 14 ++++++++++++++ 2 files changed, 31 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/3b7fb84c/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index 06d8f15..a0508ad 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -903,6 +903,23 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder { val properties = Option(ctx.tablePropertyList).map(visitPropertyKeyValues).getOrElse(Map.empty) val selectQuery = Option(ctx.query).map(plan) + // Ensuring whether no duplicate name is used in table definition + val colNames = cols.map(_.name) + if (colNames.length != colNames.distinct.length) { + val duplicateColumns = colNames.groupBy(identity).collect { + case (x, ys) if ys.length > 1 => "\"" + x + "\"" + } + throw operationNotAllowed(s"Duplicated column names found in table definition of $name: " + + 
duplicateColumns.mkString("[", ",", "]"), ctx) + } + + // For Hive tables, partition columns must not be part of the schema + val badPartCols = partitionCols.map(_.name).toSet.intersect(colNames.toSet) + if (badPartCols.nonEmpty) { + throw operationNotAllowed(s"Partition columns may not be specified in the schema: " + + badPartCols.map("\"" + _ + "\"").mkString("[", ",", "]"), ctx) + } + // Note: Hive requires partition columns to be distinct from the schema, so we need // to include the partition columns here explicitly val schema = cols ++ partitionCols http://git-wip-us.apache.org/repos/asf/spark/blob/3b7fb84c/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala index aec7e99..5bee28b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala @@ -334,6 +334,20 @@ class DDLCommandSuite extends PlanTest { assert(ct.table.storage.locationUri == Some("/something/anything")) } + test("create table - column repeated in partitioning columns") { + val query = "CREATE TABLE tab1 (key INT, value STRING) PARTITIONED BY (key INT, hr STRING)" + val e = intercept[ParseException] { parser.parsePlan(query) } + assert(e.getMessage.contains( + "Operation not allowed: Partition columns may not be specified in the schema: [\"key\"]")) + } + + test("create table - duplicate column names in the table definition") { + val query = "CREATE TABLE default.tab1 (key INT, key STRING)" + val e = intercept[ParseException] { parser.parsePlan(query) } + assert(e.getMessage.contains("Operation not allowed: Duplicated column names found in " + + "table definition of `default`.`tab1`: 
[\"key\"]")) + } + test("create table using - with partitioned by") { val query = "CREATE TABLE my_tab(a INT, b STRING) USING parquet PARTITIONED BY (a)" val expected = CreateTableUsing( --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
