This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new deb1d1b86162 [SPARK-46394][SQL] Fix spark.catalog.listDatabases() issues on schemas with special characters when `spark.sql.legacy.keepCommandOutputSchema` is set to true
deb1d1b86162 is described below
commit deb1d1b861626cfcb2522d5907f12b1e69cbfb72
Author: Xinyi Yu <[email protected]>
AuthorDate: Wed Dec 13 22:18:24 2023 -0800
[SPARK-46394][SQL] Fix spark.catalog.listDatabases() issues on schemas with special characters when `spark.sql.legacy.keepCommandOutputSchema` is set to true
### What changes were proposed in this pull request?
When the SQL conf `spark.sql.legacy.keepCommandOutputSchema` is set to true:
Before:
```
// suppose there is an xyyu-db-with-hyphen schema in the catalog
spark.catalog.listDatabases()
[INVALID_IDENTIFIER] The identifier xyyu-db-with-hyphen is invalid. Please, consider quoting it with back-quotes as `xyyu-db-with-hyphen`. SQLSTATE: 42602
(line 1, pos 4)
```
After:
```
spark.catalog.listDatabases()
.. `xyyu-db-with-hyphen` ..
```
This PR fixes the issue by falling back to the original name when parsing fails.
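The essence of the fix, sketched as a standalone snippet (using the public `CatalystSqlParser` object for illustration; the patch itself goes through the session's `sqlParser`):
```
import org.apache.spark.sql.catalyst.parser.{CatalystSqlParser, ParseException}

// Try to parse the name as a (possibly quoted) multi-part identifier;
// if parsing fails, fall back to the original string as a single part.
def parseIdentWithFallback(name: String): Seq[String] = {
  try {
    CatalystSqlParser.parseMultipartIdentifier(name)
  } catch {
    case _: ParseException => Seq(name)
  }
}

parseIdentWithFallback("a.b.c")                // Seq(a, b, c)
parseIdentWithFallback("xyyu-db-with-hyphen")  // Seq(xyyu-db-with-hyphen)
```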
### Why are the changes needed?
To fix the bug.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Newly added tests.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #44328 from anchovYu/special-char-schema-name-issue.
Authored-by: Xinyi Yu <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
---
.../apache/spark/sql/internal/CatalogImpl.scala | 21 ++++++++---
.../apache/spark/sql/internal/CatalogSuite.scala | 41 ++++++++++++++++++++++
2 files changed, 58 insertions(+), 4 deletions(-)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala
index ca1a2d49d72d..a58f5d358b4c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala
@@ -28,6 +28,7 @@ import org.apache.spark.sql.catalyst.analysis._
import org.apache.spark.sql.catalyst.catalog._
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.catalyst.expressions.{Expression, Literal}
+import org.apache.spark.sql.catalyst.parser.ParseException
 import org.apache.spark.sql.catalyst.plans.logical.{CreateTable, LocalRelation, LogicalPlan, OptionList, RecoverPartitions, ShowFunctions, ShowNamespaces, ShowTables, UnresolvedTableSpec, View}
import org.apache.spark.sql.catalyst.types.DataTypeUtils
 import org.apache.spark.sql.connector.catalog.{CatalogManager, SupportsNamespaces, TableCatalog}
@@ -49,8 +50,16 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog {
   private def sessionCatalog: SessionCatalog = sparkSession.sessionState.catalog
-  private def parseIdent(name: String): Seq[String] = {
-    sparkSession.sessionState.sqlParser.parseMultipartIdentifier(name)
+  /**
+   * Helper function for parsing identifiers.
+   * @param fallbackOnException if true, when parsing fails, return the original name.
+   */
+  private def parseIdent(name: String, fallbackOnException: Boolean = false): Seq[String] = {
+    try {
+      sparkSession.sessionState.sqlParser.parseMultipartIdentifier(name)
+    } catch {
+      case _: ParseException if fallbackOnException => Seq(name)
+    }
   }
private def qualifyV1Ident(nameParts: Seq[String]): Seq[String] = {
@@ -100,7 +109,9 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog {
case ShowNamespaces(r: ResolvedNamespace, _, _) => r.catalog
}.get
val databases = qe.toRdd.collect().map { row =>
- makeDatabase(Some(catalog.name()), row.getString(0))
+      // dbName can either be a quoted identifier (single or multi part) or an unquoted single part
+ val dbName = row.getString(0)
+ makeDatabase(Some(catalog.name()), dbName)
}
CatalogImpl.makeDataset(databases.toImmutableArraySeq, sparkSession)
}
@@ -424,9 +435,11 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog {
makeDatabase(None, dbName)
}
+  // when catalogName is specified, dbName should be a valid quoted multi-part identifier, or a
+  // valid unquoted single part identifier.
   private def makeDatabase(catalogNameOpt: Option[String], dbName: String): Database = {
val idents = catalogNameOpt match {
- case Some(catalogName) => catalogName +: parseIdent(dbName)
+      case Some(catalogName) => catalogName +: parseIdent(dbName, fallbackOnException = true)
case None => resolveNamespace(dbName)
}
val plan = UnresolvedNamespace(idents, fetchMetadata = true)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala
index 6522ebad1a4f..15733d1c8bf6 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala
@@ -167,6 +167,47 @@ class CatalogSuite extends SharedSparkSession with AnalysisTest with BeforeAndAf
Set("default", "my_db2"))
}
+ test("list databases with special character") {
+ Seq(true, false).foreach { legacy =>
+ withSQLConf(SQLConf.LEGACY_KEEP_COMMAND_OUTPUT_SCHEMA.key ->
legacy.toString) {
+ spark.catalog.setCurrentCatalog(CatalogManager.SESSION_CATALOG_NAME)
+ assert(spark.catalog.listDatabases().collect().map(_.name).toSet ==
Set("default"))
+ // use externalCatalog to bypass the database name validation in
SessionCatalog
+
spark.sharedState.externalCatalog.createDatabase(utils.newDb("my-db1"), false)
+
spark.sharedState.externalCatalog.createDatabase(utils.newDb("my`db2"), false)
+ assert(spark.catalog.listDatabases().collect().map(_.name).toSet ==
+ Set("default", "`my-db1`", "`my``db2`"))
+ // TODO: ideally there should be no difference between legacy and
non-legacy mode. However,
+ // in non-legacy mode, the ShowNamespacesExec does the quoting before
pattern matching,
+ // requiring the pattern to be quoted. This is not ideal, we should
fix it in the future.
+ if (legacy) {
+ assert(
+ spark.catalog.listDatabases("my*").collect().map(_.name).toSet ==
+ Set("`my-db1`", "`my``db2`")
+ )
+
assert(spark.catalog.listDatabases("`my*`").collect().map(_.name).toSet ==
Set.empty)
+ } else {
+
assert(spark.catalog.listDatabases("my*").collect().map(_.name).toSet ==
Set.empty)
+ assert(
+ spark.catalog.listDatabases("`my*`").collect().map(_.name).toSet ==
+ Set("`my-db1`", "`my``db2`")
+ )
+ }
+ assert(spark.catalog.listDatabases("you*").collect().map(_.name).toSet
==
+ Set.empty[String])
+ dropDatabase("my-db1")
+ assert(spark.catalog.listDatabases().collect().map(_.name).toSet ==
+ Set("default", "`my``db2`"))
+ dropDatabase("my`db2") // cleanup
+
+ spark.catalog.setCurrentCatalog("testcat")
+ sql(s"CREATE NAMESPACE testcat.`my-db`")
+ assert(spark.catalog.listDatabases().collect().map(_.name).toSet ==
Set("`my-db`"))
+ sql(s"DROP NAMESPACE testcat.`my-db`") // cleanup
+ }
+ }
+ }
+
test("list databases with current catalog") {
spark.catalog.setCurrentCatalog("testcat")
sql(s"CREATE NAMESPACE testcat.my_db")