spark git commit: [SPARK-17192][SQL] Issue Exception when Users Specify the Partitioning Columns without a Given Schema

yhuai Fri, 26 Aug 2016 11:13:55 -0700

Repository: spark
Updated Branches:
  refs/heads/master 188321623 -> fd4ba3f62



[SPARK-17192][SQL] Issue Exception when Users Specify the Partitioning Columns 
without a Given Schema

### What changes were proposed in this pull request?
Address the comments by yhuai in the original PR: 
https://github.com/apache/spark/pull/14207

First, issue an exception instead of logging a warning when users specify the 
partitioning columns without a given schema.

Second, refactor the codes a little.

### How was this patch tested?
Fixed the test cases.

Author: gatorsmile <gatorsm...@gmail.com>

Closes #14572 from gatorsmile/followup16552.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fd4ba3f6
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fd4ba3f6
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fd4ba3f6

Branch: refs/heads/master
Commit: fd4ba3f626f49d7d616a2a334d45b1c736e1db1c
Parents: 1883216
Author: gatorsmile <gatorsm...@gmail.com>
Authored: Fri Aug 26 11:13:38 2016 -0700
Committer: Yin Huai <yh...@databricks.com>
Committed: Fri Aug 26 11:13:38 2016 -0700

----------------------------------------------------------------------
 .../spark/sql/execution/datasources/rules.scala | 25 +++++++-------------
 .../spark/sql/execution/command/DDLSuite.scala  | 17 +++++++++----
 .../spark/sql/hive/HiveExternalCatalog.scala    | 16 +++++++------
 3 files changed, 29 insertions(+), 29 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/fd4ba3f6/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala
----------------------------------------------------------------------
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala
index 5eb2f0a..f14c63c 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala
@@ -72,29 +72,20 @@ case class PreprocessDDL(conf: SQLConf) extends 
Rule[LogicalPlan] {
 
   def apply(plan: LogicalPlan): LogicalPlan = plan transform {
     // When we CREATE TABLE without specifying the table schema, we should 
fail the query if
-    // bucketing information is specified, as we can't infer bucketing from 
data files currently,
-    // and we should ignore the partition columns if it's specified, as we 
will infer it later, at
-    // runtime.
+    // bucketing information is specified, as we can't infer bucketing from 
data files currently.
+    // Since the runtime inferred partition columns could be different from 
what user specified,
+    // we fail the query if the partitioning information is specified.
     case c @ CreateTable(tableDesc, _, None) if tableDesc.schema.isEmpty =>
       if (tableDesc.bucketSpec.isDefined) {
         failAnalysis("Cannot specify bucketing information if the table schema 
is not specified " +
           "when creating and will be inferred at runtime")
       }
-
-      val partitionColumnNames = tableDesc.partitionColumnNames
-      if (partitionColumnNames.nonEmpty) {
-        // The table does not have a specified schema, which means that the 
schema will be inferred
-        // at runtime. So, we are not expecting partition columns and we will 
discover partitions
-        // at runtime. However, if there are specified partition columns, we 
simply ignore them and
-        // provide a warning message.
-        logWarning(
-          s"Specified partition columns 
(${partitionColumnNames.mkString(",")}) will be " +
-            s"ignored. The schema and partition columns of table 
${tableDesc.identifier} will " +
-            "be inferred.")
-        c.copy(tableDesc = tableDesc.copy(partitionColumnNames = Nil))
-      } else {
-        c
+      if (tableDesc.partitionColumnNames.nonEmpty) {
+        failAnalysis("It is not allowed to specify partition columns when the 
table schema is " +
+          "not defined. When the table schema is not provided, schema and 
partition columns " +
+          "will be inferred.")
       }
+      c
 
     // Here we normalize partition, bucket and sort column names, w.r.t. the 
case sensitivity
     // config, and do various checks:

http://git-wip-us.apache.org/repos/asf/spark/blob/fd4ba3f6/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
----------------------------------------------------------------------
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
index e6ae422..b343454 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
@@ -265,7 +265,7 @@ class DDLSuite extends QueryTest with SharedSQLContext with 
BeforeAndAfterEach {
         userSpecifiedPartitionCols.map(p => s"PARTITIONED BY 
($p)").getOrElse("")
       val schemaClause = userSpecifiedSchema.map(s => s"($s)").getOrElse("")
       val uri = path.toURI
-      sql(
+      val sqlCreateTable =
         s"""
            |CREATE TABLE $tabName $schemaClause
            |USING parquet
@@ -273,11 +273,18 @@ class DDLSuite extends QueryTest with SharedSQLContext 
with BeforeAndAfterEach {
            |  path '$uri'
            |)
            |$partitionClause
-         """.stripMargin)
-      val tableMetadata = 
spark.sessionState.catalog.getTableMetadata(TableIdentifier(tabName))
+         """.stripMargin
+      if (userSpecifiedSchema.isEmpty && userSpecifiedPartitionCols.nonEmpty) {
+        val e = intercept[AnalysisException](sql(sqlCreateTable)).getMessage
+        assert(e.contains(
+          "not allowed to specify partition columns when the table schema is 
not defined"))
+      } else {
+        sql(sqlCreateTable)
+        val tableMetadata = 
spark.sessionState.catalog.getTableMetadata(TableIdentifier(tabName))
 
-      assert(expectedSchema == tableMetadata.schema)
-      assert(expectedPartitionCols == tableMetadata.partitionColumnNames)
+        assert(expectedSchema == tableMetadata.schema)
+        assert(expectedPartitionCols == tableMetadata.partitionColumnNames)
+      }
     }
   }
 

http://git-wip-us.apache.org/repos/asf/spark/blob/fd4ba3f6/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
----------------------------------------------------------------------
diff --git 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
index 2586d11..7f50e38 100644
--- 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
+++ 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
@@ -622,24 +622,26 @@ object HiveExternalCatalog {
   def getSchemaFromTableProperties(metadata: CatalogTable): StructType = {
     val errorMessage = "Could not read schema from the hive metastore because 
it is corrupted."
     val props = metadata.properties
-    props.get(DATASOURCE_SCHEMA).map { schema =>
+    val schema = props.get(DATASOURCE_SCHEMA)
+    if (schema.isDefined) {
       // Originally, we used `spark.sql.sources.schema` to store the schema of 
a data source table.
       // After SPARK-6024, we removed this flag.
       // Although we are not using `spark.sql.sources.schema` any more, we 
need to still support.
-      DataType.fromJson(schema).asInstanceOf[StructType]
-    } getOrElse {
-      props.get(DATASOURCE_SCHEMA_NUMPARTS).map { numParts =>
-        val parts = (0 until numParts.toInt).map { index =>
+      DataType.fromJson(schema.get).asInstanceOf[StructType]
+    } else {
+      val numSchemaParts = props.get(DATASOURCE_SCHEMA_NUMPARTS)
+      if (numSchemaParts.isDefined) {
+        val parts = (0 until numSchemaParts.get.toInt).map { index =>
           val part = 
metadata.properties.get(s"$DATASOURCE_SCHEMA_PART_PREFIX$index").orNull
           if (part == null) {
             throw new AnalysisException(errorMessage +
-              s" (missing part $index of the schema, $numParts parts are 
expected).")
+              s" (missing part $index of the schema, ${numSchemaParts.get} 
parts are expected).")
           }
           part
         }
         // Stick all parts back to a single schema string.
         DataType.fromJson(parts.mkString).asInstanceOf[StructType]
-      } getOrElse {
+      } else {
         throw new AnalysisException(errorMessage)
       }
     }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-17192][SQL] Issue Exception when Users Specify the Partitioning Columns without a Given Schema

Reply via email to