This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new fdbacdf0fb21 [SPARK-48760][SQL][DOCS][FOLLOWUP] Add `CLUSTER BY` to
doc `sql-ref-syntax-ddl-alter-table.md`
fdbacdf0fb21 is described below
commit fdbacdf0fb21873d55cb0de63653ace8363187b9
Author: panbingkun <[email protected]>
AuthorDate: Tue Jul 9 21:18:10 2024 +0800
[SPARK-48760][SQL][DOCS][FOLLOWUP] Add `CLUSTER BY` to doc
`sql-ref-syntax-ddl-alter-table.md`
### What changes were proposed in this pull request?
This PR follows up https://github.com/apache/spark/pull/47156 and aims to:
- add `CLUSTER BY` to doc `sql-ref-syntax-ddl-alter-table.md`
- move parser tests from `o.a.s.s.c.p.DDLParserSuite` to
`AlterTableClusterByParserSuite`
- use `checkError` to check exception in
`o.a.s.s.e.c.AlterTableClusterBySuiteBase`
### Why are the changes needed?
- Enable the doc `sql-ref-syntax-ddl-alter-table.md` to cover new syntax
`ALTER TABLE ... CLUSTER BY ...`.
- Align with other similar tests, e.g., AlterTableRename*.
### Does this PR introduce _any_ user-facing change?
Yes. End-users can now look up the explanation of `CLUSTER BY` in the
doc `sql-ref-syntax-ddl-alter-table.md`.
### How was this patch tested?
Updated UT.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #47254 from panbingkun/SPARK-48760_FOLLOWUP.
Authored-by: panbingkun <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
---
docs/sql-ref-syntax-ddl-alter-table.md | 71 ++++++++++++++++++++++
.../spark/sql/catalyst/parser/DDLParserSuite.scala | 18 ------
.../catalyst/analysis/ResolveSessionCatalog.scala | 9 +--
.../command/AlterTableClusterByParserSuite.scala | 44 ++++++++++++++
.../command/AlterTableClusterBySuiteBase.scala | 33 ++++++----
.../command/v1/AlterTableClusterBySuite.scala | 2 +-
.../command/v2/AlterTableClusterBySuite.scala | 6 +-
7 files changed, 142 insertions(+), 41 deletions(-)
diff --git a/docs/sql-ref-syntax-ddl-alter-table.md
b/docs/sql-ref-syntax-ddl-alter-table.md
index 31eaf659b5c7..adcfa8db06f1 100644
--- a/docs/sql-ref-syntax-ddl-alter-table.md
+++ b/docs/sql-ref-syntax-ddl-alter-table.md
@@ -233,6 +233,32 @@ ALTER TABLE table_identifier DROP [ IF EXISTS ]
partition_spec [PURGE]
Partition to be dropped. Note that one can use a typed literal (e.g.,
date'2019-01-02') in the partition spec.
**Syntax:** `PARTITION ( partition_col_name = partition_col_val [ , ... ]
)`
+
+#### CLUSTER BY
+
+`ALTER TABLE CLUSTER BY` command can also be used for changing or removing the
clustering columns for existing tables.
+
+##### Syntax
+
+```sql
+-- Changing Clustering Columns
+ALTER TABLE table_identifier CLUSTER BY ( col_name [ , ... ] )
+
+-- Removing Clustering Columns
+ALTER TABLE table_identifier CLUSTER BY NONE
+```
+
+##### Parameters
+
+* **table_identifier**
+
+ Specifies a table name, which may be optionally qualified with a database
name.
+
+ **Syntax:** `[ database_name. ] table_name`
+
+* **col_name**
+
+ Specifies the name of the column.
### SET AND UNSET
@@ -596,6 +622,51 @@ SHOW PARTITIONS StudentInfo;
| age=20|
+---------+
+-- CLUSTER BY
+DESC Teacher;
++------------------------+---------+-------+
+| col_name|data_type|comment|
++------------------------+---------+-------+
+| name| string| NULL|
+| gender| string| NULL|
+| country| string| NULL|
+| age| int| NULL|
+|# Clustering Information| | |
+| # col_name|data_type|comment|
+| gender| string| NULL|
++------------------------+---------+-------+
+
+ALTER TABLE Teacher CLUSTER BY (gender, country);
+
+-- After changing clustering columns
+DESC Teacher;
++------------------------+---------+-------+
+| col_name|data_type|comment|
++------------------------+---------+-------+
+| name| string| NULL|
+| gender| string| NULL|
+| country| string| NULL|
+| age| int| NULL|
+|# Clustering Information| | |
+| # col_name|data_type|comment|
+| gender| string| NULL|
+| country| string| NULL|
++------------------------+---------+-------+
+
+ALTER TABLE Teacher CLUSTER BY NONE;
+
+-- After removing clustering columns
+DESC Teacher;
++------------------------+---------+-------+
+| col_name|data_type|comment|
++------------------------+---------+-------+
+| name| string| NULL|
+| gender| string| NULL|
+| country| string| NULL|
+| age| int| NULL|
+|# Clustering Information| | |
++------------------------+---------+-------+
+
-- Change the fileformat
ALTER TABLE loc_orc SET fileformat orc;
diff --git
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala
index 70cfa9325419..f88c516f0019 100644
---
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala
+++
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala
@@ -21,7 +21,6 @@ import java.util.Locale
import org.apache.spark.SparkThrowable
import org.apache.spark.sql.catalyst.analysis._
-import org.apache.spark.sql.catalyst.catalog.ClusterBySpec
import org.apache.spark.sql.catalyst.expressions.{EqualTo, Hex, Literal}
import org.apache.spark.sql.catalyst.plans.logical._
import
org.apache.spark.sql.connector.catalog.TableChange.ColumnPosition.{after, first}
@@ -236,23 +235,6 @@ class DDLParserSuite extends AnalysisTest {
}
}
- test("alter table cluster by") {
- comparePlans(
- parsePlan("ALTER TABLE table_name CLUSTER BY (`a.b`, c.d, none)"),
- AlterTableClusterBy(
- UnresolvedTable(Seq("table_name"), "ALTER TABLE ... CLUSTER BY"),
- Some(ClusterBySpec(Seq(
- FieldReference(Seq("a.b")),
- FieldReference(Seq("c", "d")),
- FieldReference(Seq("none")))))))
-
- comparePlans(
- parsePlan("ALTER TABLE table_name CLUSTER BY NONE"),
- AlterTableClusterBy(
- UnresolvedTable(Seq("table_name"), "ALTER TABLE ... CLUSTER BY"),
- None))
- }
-
test("create/replace table - with comment") {
val createSql = "CREATE TABLE my_tab(a INT, b STRING) USING parquet
COMMENT 'abc'"
val replaceSql = "REPLACE TABLE my_tab(a INT, b STRING) USING parquet
COMMENT 'abc'"
diff --git
a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala
b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala
index d8fa48a72cf8..1f9419c41b74 100644
---
a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala
+++
b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala
@@ -103,13 +103,10 @@ class ResolveSessionCatalog(val catalogManager:
CatalogManager)
builder.build())
AlterTableChangeColumnCommand(table.catalogTable.identifier, colName,
newColumn)
- case AlterTableClusterBy(ResolvedTable(catalog, ident, table: V1Table, _),
clusterBySpecOpt)
+ case AlterTableClusterBy(ResolvedTable(catalog, _, table: V1Table, _),
clusterBySpecOpt)
if isSessionCatalog(catalog) =>
- val prop = clusterBySpecOpt.map { clusterBySpec =>
- Map(ClusterBySpec.toProperty(table.schema, clusterBySpec,
conf.resolver))
- }.getOrElse {
- Map(ClusterBySpec.toProperty(table.schema, ClusterBySpec(Nil),
conf.resolver))
- }
+ val prop = Map(ClusterBySpec.toProperty(table.schema,
+ clusterBySpecOpt.getOrElse(ClusterBySpec(Nil)), conf.resolver))
AlterTableSetPropertiesCommand(table.catalogTable.identifier, prop,
isView = false)
case RenameColumn(ResolvedV1TableIdentifier(ident), _, _) =>
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableClusterByParserSuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableClusterByParserSuite.scala
new file mode 100644
index 000000000000..1bbb3eb35fed
--- /dev/null
+++
b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableClusterByParserSuite.scala
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.sql.execution.command
+
+import org.apache.spark.sql.catalyst.analysis.{AnalysisTest, UnresolvedTable}
+import org.apache.spark.sql.catalyst.catalog.ClusterBySpec
+import org.apache.spark.sql.catalyst.parser.CatalystSqlParser.parsePlan
+import org.apache.spark.sql.catalyst.plans.logical.AlterTableClusterBy
+import org.apache.spark.sql.connector.expressions.FieldReference
+import org.apache.spark.sql.test.SharedSparkSession
+
+class AlterTableClusterByParserSuite extends AnalysisTest with
SharedSparkSession {
+
+ test("alter table cluster by") {
+ comparePlans(
+ parsePlan("ALTER TABLE table_name CLUSTER BY (`a.b`, c.d, none)"),
+ AlterTableClusterBy(
+ UnresolvedTable(Seq("table_name"), "ALTER TABLE ... CLUSTER BY"),
+ Some(ClusterBySpec(Seq(
+ FieldReference(Seq("a.b")),
+ FieldReference(Seq("c", "d")),
+ FieldReference(Seq("none")))))))
+
+ comparePlans(
+ parsePlan("ALTER TABLE table_name CLUSTER BY NONE"),
+ AlterTableClusterBy(
+ UnresolvedTable(Seq("table_name"), "ALTER TABLE ... CLUSTER BY"),
+ None))
+ }
+}
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableClusterBySuiteBase.scala
b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableClusterBySuiteBase.scala
index 8961019f3f8d..73a80cd91069 100644
---
a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableClusterBySuiteBase.scala
+++
b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableClusterBySuiteBase.scala
@@ -45,34 +45,33 @@ trait AlterTableClusterBySuiteBase extends QueryTest with
DDLCommandTestUtils {
test("test basic ALTER TABLE with clustering columns") {
withNamespaceAndTable("ns", "table") { tbl =>
- spark.sql(s"CREATE TABLE $tbl (id INT, data STRING) $defaultUsing
CLUSTER BY (id, data)")
+ sql(s"CREATE TABLE $tbl (id INT, data STRING) $defaultUsing CLUSTER BY
(id, data)")
validateClusterBy(tbl, Seq("id", "data"))
- spark.sql(s"ALTER TABLE $tbl CLUSTER BY (data, id)")
+ sql(s"ALTER TABLE $tbl CLUSTER BY (data, id)")
validateClusterBy(tbl, Seq("data", "id"))
- spark.sql(s"ALTER TABLE $tbl CLUSTER BY NONE")
+ sql(s"ALTER TABLE $tbl CLUSTER BY NONE")
validateClusterBy(tbl, Seq.empty)
- spark.sql(s"ALTER TABLE $tbl CLUSTER BY (id)")
+ sql(s"ALTER TABLE $tbl CLUSTER BY (id)")
validateClusterBy(tbl, Seq("id"))
}
}
test("test clustering columns with comma") {
withNamespaceAndTable("ns", "table") { tbl =>
- spark.sql(s"CREATE TABLE $tbl (`i,d` INT, data STRING) $defaultUsing " +
- "CLUSTER BY (`i,d`, data)")
+ sql(s"CREATE TABLE $tbl (`i,d` INT, data STRING) $defaultUsing CLUSTER
BY (`i,d`, data)")
validateClusterBy(tbl, Seq("`i,d`", "data"))
- spark.sql(s"ALTER TABLE $tbl CLUSTER BY (data, `i,d`)")
+ sql(s"ALTER TABLE $tbl CLUSTER BY (data, `i,d`)")
validateClusterBy(tbl, Seq("data", "`i,d`"))
}
}
test("test nested clustering columns") {
withNamespaceAndTable("ns", "table") { tbl =>
- spark.sql(s"CREATE TABLE $tbl " +
+ sql(s"CREATE TABLE $tbl " +
s"($nestedColumnSchema) " +
s"$defaultUsing CLUSTER BY (${nestedClusteringColumns.mkString(",")})")
validateClusterBy(tbl, nestedClusteringColumns)
- spark.sql(s"ALTER TABLE $tbl CLUSTER BY
(${nestedClusteringColumnsNew.mkString(",")})")
+ sql(s"ALTER TABLE $tbl CLUSTER BY
(${nestedClusteringColumnsNew.mkString(",")})")
validateClusterBy(tbl, nestedClusteringColumnsNew)
}
}
@@ -80,10 +79,18 @@ trait AlterTableClusterBySuiteBase extends QueryTest with
DDLCommandTestUtils {
test("clustering columns not defined in schema") {
withNamespaceAndTable("ns", "table") { tbl =>
sql(s"CREATE TABLE $tbl (id bigint, data string) $defaultUsing CLUSTER
BY (id)")
- val err = intercept[AnalysisException] {
- sql(s"ALTER TABLE $tbl CLUSTER BY (unknown)")
- }
- assert(err.message.contains("Couldn't find column unknown in:"))
+ checkError(
+ exception = intercept[AnalysisException] {
+ sql(s"ALTER TABLE $tbl CLUSTER BY (unknown)")
+ },
+ errorClass = "_LEGACY_ERROR_TEMP_3060",
+ parameters = Map("i" -> "unknown",
+ "schema" ->
+ """root
+ | |-- id: long (nullable = true)
+ | |-- data: string (nullable = true)
+ |""".stripMargin)
+ )
}
}
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/AlterTableClusterBySuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/AlterTableClusterBySuite.scala
index 385ef961127a..f5f2b7517b48 100644
---
a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/AlterTableClusterBySuite.scala
+++
b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/AlterTableClusterBySuite.scala
@@ -36,7 +36,7 @@ trait AlterTableClusterBySuiteBase extends
command.AlterTableClusterBySuiteBase
override def validateClusterBy(tableName: String, clusteringColumns:
Seq[String]): Unit = {
val catalog = spark.sessionState.catalog
val (_, db, t) = parseTableName(tableName)
- val table = catalog.getTableMetadata(TableIdentifier.apply(t, Some(db)))
+ val table = catalog.getTableMetadata(TableIdentifier(t, Some(db)))
assert(table.clusterBySpec ===
Some(ClusterBySpec(clusteringColumns.map(FieldReference(_)))))
}
}
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AlterTableClusterBySuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AlterTableClusterBySuite.scala
index bbbe6cd75875..8a74d9c3572b 100644
---
a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AlterTableClusterBySuite.scala
+++
b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AlterTableClusterBySuite.scala
@@ -40,13 +40,13 @@ class AlterTableClusterBySuite extends
command.AlterTableClusterBySuiteBase
test("test REPLACE TABLE with clustering columns") {
withNamespaceAndTable("ns", "table") { tbl =>
- spark.sql(s"CREATE TABLE $tbl (id INT) $defaultUsing CLUSTER BY (id)")
+ sql(s"CREATE TABLE $tbl (id INT) $defaultUsing CLUSTER BY (id)")
validateClusterBy(tbl, Seq("id"))
- spark.sql(s"REPLACE TABLE $tbl (id INT, id2 INT) $defaultUsing CLUSTER
BY (id2)")
+ sql(s"REPLACE TABLE $tbl (id INT, id2 INT) $defaultUsing CLUSTER BY
(id2)")
validateClusterBy(tbl, Seq("id2"))
- spark.sql(s"ALTER TABLE $tbl CLUSTER BY (id)")
+ sql(s"ALTER TABLE $tbl CLUSTER BY (id)")
validateClusterBy(tbl, Seq("id"))
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]