This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new deb1d1b86162 [SPARK-46394][SQL] Fix spark.catalog.listDatabases() issues on schemas with special characters when `spark.sql.legacy.keepCommandOutputSchema` is set to true
deb1d1b86162 is described below

commit deb1d1b861626cfcb2522d5907f12b1e69cbfb72
Author: Xinyi Yu <xinyi...@databricks.com>
AuthorDate: Wed Dec 13 22:18:24 2023 -0800

    [SPARK-46394][SQL] Fix spark.catalog.listDatabases() issues on schemas with special characters when `spark.sql.legacy.keepCommandOutputSchema` is set to true
    
    ### What changes were proposed in this pull request?
    When the SQL conf `spark.sql.legacy.keepCommandOutputSchema` is set to true:
    Before:
    ```
    // suppose there is a xyyu-db-with-hyphen schema in the catalog
    spark.catalog.listDatabases()
    
    [INVALID_IDENTIFIER] The identifier xyyu-db-with-hyphen is invalid. Please, consider quoting it with back-quotes as `xyyu-db-with-hyphen`. SQLSTATE: 42602 (line 1, pos 4)
    ```
    
    After:
    ```
    spark.catalog.listDatabases()
    
    .. `xyyu-db-with-hyphen` ..
    ```
    
    This PR fixes the issue by falling back to the original name when parsing fails.
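    
    For illustration, a minimal usage sketch (it mirrors the newly added test rather than the patch itself, and assumes a schema named xyyu-db-with-hyphen already exists in the session catalog, created through the external catalog to bypass name validation):
    ```
    spark.conf.set("spark.sql.legacy.keepCommandOutputSchema", "true")
    // Before this patch, the call below threw [INVALID_IDENTIFIER] while
    // turning the SHOW NAMESPACES output into Database objects.
    val names = spark.catalog.listDatabases().collect().map(_.name)
    // With the fix, a name that fails to parse falls back to the raw
    // string, so names contains the back-quoted form `xyyu-db-with-hyphen`.
    ```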
    
    ### Why are the changes needed?
    To fix the bug.
    
    ### Does this PR introduce _any_ user-facing change?
    No.
    
    ### How was this patch tested?
    Newly added test "list databases with special character" in `CatalogSuite`.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    No.
    
    Closes #44328 from anchovYu/special-char-schema-name-issue.
    
    Authored-by: Xinyi Yu <xinyi...@databricks.com>
    Signed-off-by: Wenchen Fan <wenc...@databricks.com>
---
 .../apache/spark/sql/internal/CatalogImpl.scala    | 21 ++++++++---
 .../apache/spark/sql/internal/CatalogSuite.scala   | 41 ++++++++++++++++++++++
 2 files changed, 58 insertions(+), 4 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala
index ca1a2d49d72d..a58f5d358b4c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala
@@ -28,6 +28,7 @@ import org.apache.spark.sql.catalyst.analysis._
 import org.apache.spark.sql.catalyst.catalog._
 import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
 import org.apache.spark.sql.catalyst.expressions.{Expression, Literal}
+import org.apache.spark.sql.catalyst.parser.ParseException
 import org.apache.spark.sql.catalyst.plans.logical.{CreateTable, LocalRelation, LogicalPlan, OptionList, RecoverPartitions, ShowFunctions, ShowNamespaces, ShowTables, UnresolvedTableSpec, View}
 import org.apache.spark.sql.catalyst.types.DataTypeUtils
 import org.apache.spark.sql.connector.catalog.{CatalogManager, SupportsNamespaces, TableCatalog}
@@ -49,8 +50,16 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog {
 
   private def sessionCatalog: SessionCatalog = sparkSession.sessionState.catalog
 
-  private def parseIdent(name: String): Seq[String] = {
-    sparkSession.sessionState.sqlParser.parseMultipartIdentifier(name)
+  /**
+   * Helper function for parsing identifiers.
+   * @param fallbackOnException if true, when parsing fails, return the original name.
+   */
+  private def parseIdent(name: String, fallbackOnException: Boolean = false): Seq[String] = {
+    try {
+      sparkSession.sessionState.sqlParser.parseMultipartIdentifier(name)
+    } catch {
+      case _: ParseException if fallbackOnException => Seq(name)
+    }
   }
 
   private def qualifyV1Ident(nameParts: Seq[String]): Seq[String] = {
@@ -100,7 +109,9 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog {
       case ShowNamespaces(r: ResolvedNamespace, _, _) => r.catalog
     }.get
     val databases = qe.toRdd.collect().map { row =>
-      makeDatabase(Some(catalog.name()), row.getString(0))
+      // dbName can either be a quoted identifier (single or multi part) or an unquoted single part
+      val dbName = row.getString(0)
+      makeDatabase(Some(catalog.name()), dbName)
     }
     CatalogImpl.makeDataset(databases.toImmutableArraySeq, sparkSession)
   }
@@ -424,9 +435,11 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog {
     makeDatabase(None, dbName)
   }
 
+  // when catalogName is specified, dbName should be a valid quoted multi-part identifier, or a
+  // valid unquoted single part identifier.
   private def makeDatabase(catalogNameOpt: Option[String], dbName: String): Database = {
     val idents = catalogNameOpt match {
-      case Some(catalogName) => catalogName +: parseIdent(dbName)
+      case Some(catalogName) => catalogName +: parseIdent(dbName, fallbackOnException = true)
       case None => resolveNamespace(dbName)
     }
     val plan = UnresolvedNamespace(idents, fetchMetadata = true)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala
index 6522ebad1a4f..15733d1c8bf6 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala
@@ -167,6 +167,47 @@ class CatalogSuite extends SharedSparkSession with AnalysisTest with BeforeAndAf
       Set("default", "my_db2"))
   }
 
+  test("list databases with special character") {
+    Seq(true, false).foreach { legacy =>
+      withSQLConf(SQLConf.LEGACY_KEEP_COMMAND_OUTPUT_SCHEMA.key -> legacy.toString) {
+        spark.catalog.setCurrentCatalog(CatalogManager.SESSION_CATALOG_NAME)
+        assert(spark.catalog.listDatabases().collect().map(_.name).toSet == Set("default"))
+        // use externalCatalog to bypass the database name validation in SessionCatalog
+        spark.sharedState.externalCatalog.createDatabase(utils.newDb("my-db1"), false)
+        spark.sharedState.externalCatalog.createDatabase(utils.newDb("my`db2"), false)
+        assert(spark.catalog.listDatabases().collect().map(_.name).toSet ==
+          Set("default", "`my-db1`", "`my``db2`"))
+        // TODO: ideally there should be no difference between legacy and non-legacy mode. However,
+        //  in non-legacy mode, the ShowNamespacesExec does the quoting before pattern matching,
+        //  requiring the pattern to be quoted. This is not ideal, we should fix it in the future.
+        if (legacy) {
+          assert(
+            spark.catalog.listDatabases("my*").collect().map(_.name).toSet ==
+              Set("`my-db1`", "`my``db2`")
+          )
+          assert(spark.catalog.listDatabases("`my*`").collect().map(_.name).toSet == Set.empty)
+        } else {
+          assert(spark.catalog.listDatabases("my*").collect().map(_.name).toSet == Set.empty)
+          assert(
+            spark.catalog.listDatabases("`my*`").collect().map(_.name).toSet ==
+              Set("`my-db1`", "`my``db2`")
+          )
+        }
+        assert(spark.catalog.listDatabases("you*").collect().map(_.name).toSet ==
+          Set.empty[String])
+        dropDatabase("my-db1")
+        assert(spark.catalog.listDatabases().collect().map(_.name).toSet ==
+          Set("default", "`my``db2`"))
+        dropDatabase("my`db2") // cleanup
+
+        spark.catalog.setCurrentCatalog("testcat")
+        sql(s"CREATE NAMESPACE testcat.`my-db`")
+        assert(spark.catalog.listDatabases().collect().map(_.name).toSet == Set("`my-db`"))
+        sql(s"DROP NAMESPACE testcat.`my-db`") // cleanup
+      }
+    }
+  }
+
   test("list databases with current catalog") {
     spark.catalog.setCurrentCatalog("testcat")
     sql(s"CREATE NAMESPACE testcat.my_db")


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
