spark git commit: [SPARK-15676][SQL] Disallow Column Names as Partition Columns For Hive Tables

yhuai Mon, 13 Jun 2016 13:30:44 -0700

Repository: spark
Updated Branches:
  refs/heads/master a6a18a457 -> 3b7fb84cf



[SPARK-15676][SQL] Disallow Column Names as Partition Columns For Hive Tables

#### What changes were proposed in this pull request?
When creating a Hive table (not a data source table), a common error users might 
make is to specify an existing column name as a partition column. Below is what 
Hive returns in this case:
```
hive> CREATE TABLE partitioned (id bigint, data string) PARTITIONED BY (data 
string, part string);
FAILED: SemanticException [Error 10035]: Column repeated in partitioning columns
```
Currently, the error we issue is very confusing:
```
org.apache.spark.sql.AnalysisException: 
org.apache.hadoop.hive.ql.metadata.HiveException: MetaException(message:For 
direct MetaStore DB connections, we don't support retries at the client level.);
```
This PR fixes the above issue by catching the usage error in `Parser`.

#### How was this patch tested?
Added two test cases to `DDLCommandSuite`.

Author: gatorsmile <[email protected]>

Closes #13415 from gatorsmile/partitionColumnsInTableSchema.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3b7fb84c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3b7fb84c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3b7fb84c

Branch: refs/heads/master
Commit: 3b7fb84cf88bcae56713fd56396db537fa18f2e5
Parents: a6a18a4
Author: gatorsmile <[email protected]>
Authored: Mon Jun 13 13:22:46 2016 -0700
Committer: Yin Huai <[email protected]>
Committed: Mon Jun 13 13:22:46 2016 -0700

----------------------------------------------------------------------
 .../spark/sql/execution/SparkSqlParser.scala       | 17 +++++++++++++++++
 .../sql/execution/command/DDLCommandSuite.scala    | 14 ++++++++++++++
 2 files changed, 31 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/3b7fb84c/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala
----------------------------------------------------------------------
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala
index 06d8f15..a0508ad 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala
@@ -903,6 +903,23 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder 
{
     val properties = 
Option(ctx.tablePropertyList).map(visitPropertyKeyValues).getOrElse(Map.empty)
     val selectQuery = Option(ctx.query).map(plan)
 
+    // Ensuring whether no duplicate name is used in table definition
+    val colNames = cols.map(_.name)
+    if (colNames.length != colNames.distinct.length) {
+      val duplicateColumns = colNames.groupBy(identity).collect {
+        case (x, ys) if ys.length > 1 => "\"" + x + "\""
+      }
+      throw operationNotAllowed(s"Duplicated column names found in table 
definition of $name: " +
+        duplicateColumns.mkString("[", ",", "]"), ctx)
+    }
+
+    // For Hive tables, partition columns must not be part of the schema
+    val badPartCols = partitionCols.map(_.name).toSet.intersect(colNames.toSet)
+    if (badPartCols.nonEmpty) {
+      throw operationNotAllowed(s"Partition columns may not be specified in 
the schema: " +
+        badPartCols.map("\"" + _ + "\"").mkString("[", ",", "]"), ctx)
+    }
+
     // Note: Hive requires partition columns to be distinct from the schema, 
so we need
     // to include the partition columns here explicitly
     val schema = cols ++ partitionCols

http://git-wip-us.apache.org/repos/asf/spark/blob/3b7fb84c/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala
----------------------------------------------------------------------
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala
 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala
index aec7e99..5bee28b 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala
@@ -334,6 +334,20 @@ class DDLCommandSuite extends PlanTest {
     assert(ct.table.storage.locationUri == Some("/something/anything"))
   }
 
+  test("create table - column repeated in partitioning columns") {
+    val query = "CREATE TABLE tab1 (key INT, value STRING) PARTITIONED BY (key 
INT, hr STRING)"
+    val e = intercept[ParseException] { parser.parsePlan(query) }
+    assert(e.getMessage.contains(
+      "Operation not allowed: Partition columns may not be specified in the 
schema: [\"key\"]"))
+  }
+
+  test("create table - duplicate column names in the table definition") {
+    val query = "CREATE TABLE default.tab1 (key INT, key STRING)"
+    val e = intercept[ParseException] { parser.parsePlan(query) }
+    assert(e.getMessage.contains("Operation not allowed: Duplicated column 
names found in " +
+      "table definition of `default`.`tab1`: [\"key\"]"))
+  }
+
   test("create table using - with partitioned by") {
     val query = "CREATE TABLE my_tab(a INT, b STRING) USING parquet 
PARTITIONED BY (a)"
     val expected = CreateTableUsing(


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

spark git commit: [SPARK-15676][SQL] Disallow Column Names as Partition Columns For Hive Tables

Reply via email to